// io.deephaven.csv.reading.ParseDenseStorageToColumn (Maven / Gradle / Ivy)
//
// Go to download
package io.deephaven.csv.reading;

import io.deephaven.csv.densestorage.DenseStorageReader;
import io.deephaven.csv.parsers.*;
import io.deephaven.csv.sinks.Sink;
import io.deephaven.csv.sinks.SinkFactory;
import io.deephaven.csv.tokenization.Tokenizer;
import io.deephaven.csv.util.CsvReaderException;
import java.util.*;

import io.deephaven.csv.util.MutableBoolean;
import io.deephaven.csv.util.MutableDouble;
import io.deephaven.csv.util.MutableLong;
import org.jetbrains.annotations.NotNull;

/**
 * The job of this class is to take a column of cell text, as prepared by {@link ParseInputToDenseStorage}, do type
 * inference if appropriate, and parse the text into typed data.
 */
public final class ParseDenseStorageToColumn {
    /**
     * @param dsr A reader for the input.
     * @param dsrAlt A second reader for the same input (used to perform the second pass over the data, if type
     *        inference deems a second pass to be necessary).
     * @param parsers The set of parsers to try. If null, then {@link Parsers#DEFAULT} will be used.
     * @param nullValueLiteral If a cell text is equal to this value, it will be interpreted as the null value.
     *        Typically set to the empty string.
     * @param nullParser The Parser to use if {@code parsers.size() > 1} but the column contains all null values. This
     *        is needed as a backstop because otherwise type inference would have no way to choose among the multiple
     *        parsers.
     * @param sinkFactory Factory that makes all of the Sinks of various types, used to consume the data we produce.
     * @return The {@link Sink}, provided by the caller's {@link SinkFactory}, that was selected to hold the column
     *         data.
     */
    public static Sink doit(
            final DenseStorageReader dsr,
            final DenseStorageReader dsrAlt,
            List> parsers,
            final Parser nullParser,
            final Tokenizer.CustomTimeZoneParser customTimeZoneParser,
            final String nullValueLiteral,
            final SinkFactory sinkFactory)
            throws CsvReaderException {
        Set> parserSet = new HashSet<>(Objects.requireNonNullElse(parsers, Parsers.DEFAULT));

        final Tokenizer tokenizer = new Tokenizer(customTimeZoneParser);
        final Parser.GlobalContext gctx =
                new Parser.GlobalContext(tokenizer, sinkFactory, nullValueLiteral);

        // Skip over leading null cells. There are three cases:
        // 1. There is a non-null cell (so the type inference process can begin)
        // 2. The column is full of all nulls
        // 3. The column is empty
        final IteratorHolder ih = new IteratorHolder(dsr);
        boolean columnIsEmpty = true;
        boolean columnIsAllNulls = true;
        while (ih.tryMoveNext()) {
            columnIsEmpty = false;
            if (!gctx.isNullCell(ih)) {
                columnIsAllNulls = false;
                break;
            }
        }

        if (columnIsAllNulls) {
            // We get here in cases 2 and 3: the column is all nulls, or the column is empty.
            final Parser nullParserToUse =
                    parserSet.size() == 1 ? parserSet.iterator().next() : nullParser;
            if (nullParserToUse == null) {
                throw new CsvReaderException(
                        "Column contains all null cells: can't infer type of column, and nullParser is not set.");
            }
            if (columnIsEmpty) {
                return emptyParse(nullParserToUse, gctx);
            }
            return onePhaseParse(nullParserToUse, gctx, dsrAlt);
        }

        final CategorizedParsers cats = CategorizedParsers.create(parserSet);

        if (cats.customParser != null) {
            return onePhaseParse(cats.customParser, gctx, dsrAlt);
        }

        // Numerics are special and they get their own fast path that uses Sources and Sinks rather than
        // reparsing the text input.
        final MutableDouble dummyDouble = new MutableDouble();
        if (!cats.numericParsers.isEmpty() && tokenizer.tryParseDouble(ih.bs(), dummyDouble)) {
            return parseNumerics(cats, gctx, ih, dsrAlt);
        }

        List> universeByPrecedence = List.of(Parsers.CHAR, Parsers.STRING);
        final MutableBoolean dummyBoolean = new MutableBoolean();
        final MutableLong dummyLong = new MutableLong();
        if (cats.timestampParser != null && tokenizer.tryParseLong(ih.bs(), dummyLong)) {
            universeByPrecedence = List.of(cats.timestampParser, Parsers.CHAR, Parsers.STRING);
        } else if (cats.booleanParser != null && tokenizer.tryParseBoolean(ih.bs(), dummyBoolean)) {
            universeByPrecedence = List.of(Parsers.BOOLEAN, Parsers.STRING);
        } else if (cats.dateTimeParser != null && tokenizer.tryParseDateTime(ih.bs(), dummyLong)) {
            universeByPrecedence = List.of(Parsers.DATETIME, Parsers.STRING);
        }
        List> parsersToUse = limitToSpecified(universeByPrecedence, parserSet);
        return parseFromList(parsersToUse, gctx, ih, dsrAlt);
    }

    @NotNull
    private static Sink parseNumerics(
            CategorizedParsers cats,
            final Parser.GlobalContext gctx,
            final IteratorHolder ih,
            final DenseStorageReader dsrAlt)
            throws CsvReaderException {
        final List wrappers = new ArrayList<>();
        for (Parser parser : cats.numericParsers) {
            final ParserResultWrapper prw = parseNumericsHelper(parser, gctx, ih);
            wrappers.add(prw);
            if (ih.isExhausted()) {
                // Parsed everything with numerics!
                return unifyNumericResults(gctx, wrappers);
            }
        }

        return parseFromList(cats.charAndStringParsers, gctx, ih, dsrAlt);
    }

    @NotNull
    private static  ParserResultWrapper parseNumericsHelper(
            Parser parser, final Parser.GlobalContext gctx, final IteratorHolder ih)
            throws CsvReaderException {
        final Parser.ParserContext pctx = parser.makeParserContext(gctx, Parser.CHUNK_SIZE);
        final long begin = ih.numConsumed() - 1;
        final long end = parser.tryParse(gctx, pctx, ih, begin, Long.MAX_VALUE, true);
        return new ParserResultWrapper(pctx, begin, end);
    }

    @NotNull
    private static Sink parseFromList(
            final List> parsers,
            final Parser.GlobalContext gctx,
            final IteratorHolder ih,
            final DenseStorageReader dsrAlt)
            throws CsvReaderException {
        if (parsers.isEmpty()) {
            throw new CsvReaderException("No available parsers.");
        }

        for (int ii = 0; ii < parsers.size() - 1; ++ii) {
            final Sink result = tryTwoPhaseParse(parsers.get(ii), gctx, ih, dsrAlt);
            if (result != null) {
                return result;
            }
        }

        // The final parser in the set gets special (more efficient) handling because there's nothing to
        // fall back to.
        return onePhaseParse(parsers.get(parsers.size() - 1), gctx, dsrAlt);
    }

    private static  Sink tryTwoPhaseParse(
            final Parser parser,
            final Parser.GlobalContext gctx,
            final IteratorHolder ih,
            final DenseStorageReader dsrAlt)
            throws CsvReaderException {
        final long phaseOneStart = ih.numConsumed() - 1;
        final Parser.ParserContext pctx = parser.makeParserContext(gctx, Parser.CHUNK_SIZE);
        parser.tryParse(gctx, pctx, ih, phaseOneStart, Long.MAX_VALUE, true);
        if (!ih.isExhausted()) {
            // This parser couldn't make it to the end but there are others remaining to try. Signal a
            // failure to the
            // caller so that it can try the next one.
            return null;
        }
        if (phaseOneStart == 0) {
            // Reached end, and started at zero so everything was parsed and we are done.
            return pctx.sink();
        }

        final IteratorHolder ihAlt = new IteratorHolder(dsrAlt);
        ihAlt.tryMoveNext(); // Input is not empty, so we know this will succeed.
        final long end = parser.tryParse(gctx, pctx, ihAlt, 0, phaseOneStart, false);

        if (end == phaseOneStart) {
            return pctx.sink();
        }
        final String message =
                "Logic error: second parser phase failed on input. Parser was: "
                        + parser.getClass().getCanonicalName();
        throw new RuntimeException(message);
    }

    @NotNull
    private static  Sink onePhaseParse(
            final Parser parser, final Parser.GlobalContext gctx, final DenseStorageReader dsrAlt)
            throws CsvReaderException {
        final Parser.ParserContext pctx = parser.makeParserContext(gctx, Parser.CHUNK_SIZE);
        final IteratorHolder ihAlt = new IteratorHolder(dsrAlt);
        ihAlt.tryMoveNext(); // Input is not empty, so we know this will succeed.
        parser.tryParse(gctx, pctx, ihAlt, 0, Long.MAX_VALUE, true);
        if (ihAlt.isExhausted()) {
            return pctx.sink();
        }
        final String message =
                "One phase parser failed on input. Parser was: " + parser.getClass().getCanonicalName();
        throw new CsvReaderException(message);
    }

    @NotNull
    private static  Sink emptyParse(
            final Parser parser, final Parser.GlobalContext gctx) throws CsvReaderException {
        // The parser won't do any "parsing" here, but it will create a Sink.
        final Parser.ParserContext pctx = parser.makeParserContext(gctx, Parser.CHUNK_SIZE);
        parser.tryParse(gctx, pctx, null, 0, 0, true); // Result ignored.
        return pctx.sink();
    }

    @NotNull
    private static Sink unifyNumericResults(
            final Parser.GlobalContext gctx, final List wrappers) {
        if (wrappers.isEmpty()) {
            throw new RuntimeException("Logic error: no parser results.");
        }
        final ParserResultWrapper dest = wrappers.get(wrappers.size() - 1);

        // BTW, there's an edge case where there's only one parser in the list. In that case first ==
        // dest,
        // but this code still does the right thing.
        final ParserResultWrapper first = wrappers.get(0);
        fillNulls(gctx, dest.pctx, 0, first.begin);

        long destBegin = first.begin;
        for (int ii = 0; ii < wrappers.size() - 1; ++ii) {
            final ParserResultWrapper curr = wrappers.get(ii);
            copy(gctx, curr.pctx, dest.pctx, curr.begin, curr.end, destBegin);
            destBegin += (curr.end - curr.begin);
        }
        return dest.pctx.sink();
    }

    private static  void copy(
            final Parser.GlobalContext gctx,
            final Parser.ParserContext sourceCtx,
            final Parser.ParserContext destCtx,
            final long srcBegin,
            final long srcEnd,
            final long destBegin) {
        TypeConverter.copy(
                sourceCtx.source(),
                destCtx.sink(),
                srcBegin,
                srcEnd,
                destBegin,
                sourceCtx.valueChunk(),
                destCtx.valueChunk(),
                gctx.nullChunk());
    }

    private static  void fillNulls(
            final Parser.GlobalContext gctx,
            final Parser.ParserContext pctx,
            final long begin,
            final long end) {
        if (begin == end) {
            return;
        }
        final boolean[] nullBuffer = gctx.nullChunk();
        final Sink destSink = pctx.sink();
        final TARRAY values = pctx.valueChunk();

        final int sizeToInit = Math.min(nullBuffer.length, Math.toIntExact(end - begin));
        Arrays.fill(nullBuffer, 0, sizeToInit, true);

        for (long current = begin; current != end;) { // no ++
            final long endToUse = Math.min(current + nullBuffer.length, end);
            // Don't care about the actual values, only the null flag values (which are all true).
            destSink.write(values, nullBuffer, current, endToUse, false);
            current = endToUse;
        }
    }

    private static  List limitToSpecified(Collection items, Set limitTo) {
        final List result = new ArrayList<>();
        for (T item : items) {
            if (limitTo.contains(item)) {
                result.add(item);
            }
        }
        return result;
    }

    private static class CategorizedParsers {
        public static CategorizedParsers create(final Collection> parsers)
                throws CsvReaderException {
            Parser booleanParser = null;
            final Set> specifiedNumericParsers = new HashSet<>();
            // Subset of the above.
            final List> specifiedFloatingPointParsers = new ArrayList<>();
            Parser dateTimeParser = null;
            final Set> specifiedCharAndStringParsers = new HashSet<>();
            final List> specifiedTimeStampParsers = new ArrayList<>();
            final List> specifiedCustomParsers = new ArrayList<>();
            for (Parser p : parsers) {
                if (p == Parsers.BYTE || p == Parsers.SHORT || p == Parsers.INT || p == Parsers.LONG) {
                    specifiedNumericParsers.add(p);
                    continue;
                }

                if (p == Parsers.FLOAT_FAST || p == Parsers.FLOAT_STRICT || p == Parsers.DOUBLE) {
                    specifiedNumericParsers.add(p);
                    specifiedFloatingPointParsers.add(p);
                    continue;
                }

                if (p == Parsers.TIMESTAMP_SECONDS
                        || p == Parsers.TIMESTAMP_MILLIS
                        || p == Parsers.TIMESTAMP_MICROS
                        || p == Parsers.TIMESTAMP_NANOS) {
                    specifiedTimeStampParsers.add(p);
                    continue;
                }

                if (p == Parsers.CHAR || p == Parsers.STRING) {
                    specifiedCharAndStringParsers.add(p);
                    continue;
                }

                if (p == Parsers.BOOLEAN) {
                    booleanParser = p;
                    continue;
                }

                if (p == Parsers.DATETIME) {
                    dateTimeParser = p;
                    continue;
                }

                specifiedCustomParsers.add(p);
            }

            if (specifiedFloatingPointParsers.size() > 1) {
                throw new CsvReaderException(
                        "There is more than one floating point parser in the parser set.");
            }

            if (specifiedTimeStampParsers.size() > 1) {
                throw new CsvReaderException("There is more than one timestamp parser in the parser set.");
            }

            if (specifiedCustomParsers.size() > 1) {
                throw new CsvReaderException("There is more than one custom parser in the parser set.");
            }

            if (!specifiedCustomParsers.isEmpty() && parsers.size() != 1) {
                throw new CsvReaderException(
                        "When a custom parser is specified, it must be the only parser in the set.");
            }

            if (!specifiedNumericParsers.isEmpty() && !specifiedTimeStampParsers.isEmpty()) {
                throw new CsvReaderException(
                        "The parser set must not contain both numeric and timestamp parsers.");
            }

            final List> allNumericParsersByPrecedence =
                    List.of(
                            Parsers.BYTE,
                            Parsers.SHORT,
                            Parsers.INT,
                            Parsers.LONG,
                            Parsers.FLOAT_FAST,
                            Parsers.FLOAT_STRICT,
                            Parsers.DOUBLE);
            final List> allCharAndStringParsersByPrecedence =
                    List.of(Parsers.CHAR, Parsers.STRING);

            final List> numericParsers =
                    limitToSpecified(allNumericParsersByPrecedence, specifiedNumericParsers);
            final List> charAndStringParsers =
                    limitToSpecified(allCharAndStringParsersByPrecedence, specifiedCharAndStringParsers);
            final Parser timestampParser =
                    specifiedTimeStampParsers.isEmpty() ? null : specifiedTimeStampParsers.get(0);
            final Parser customParser =
                    specifiedCustomParsers.isEmpty() ? null : specifiedCustomParsers.get(0);

            return new CategorizedParsers(
                    booleanParser,
                    numericParsers,
                    dateTimeParser,
                    charAndStringParsers,
                    timestampParser,
                    customParser);
        }

        private final Parser booleanParser;
        private final List> numericParsers;
        private final Parser dateTimeParser;
        private final List> charAndStringParsers;
        private final Parser timestampParser;
        private final Parser customParser;

        private CategorizedParsers(
                Parser booleanParser,
                List> numericParsers,
                Parser dateTimeParser,
                List> charAndStringParsers,
                Parser timestampParser,
                Parser customParser) {
            this.booleanParser = booleanParser;
            this.numericParsers = numericParsers;
            this.dateTimeParser = dateTimeParser;
            this.charAndStringParsers = charAndStringParsers;
            this.timestampParser = timestampParser;
            this.customParser = customParser;
        }
    }

    private static class ParserResultWrapper {
        private final Parser.ParserContext pctx;
        private final long begin;
        private final long end;

        public ParserResultWrapper(Parser.ParserContext pctx, long begin, long end) {
            this.pctx = pctx;
            this.begin = begin;
            this.end = end;
        }
    }
}