// io.deephaven.csv.CsvSpecs (Maven / Gradle / Ivy)
//
// Go to download
package io.deephaven.csv;

import io.deephaven.csv.annotations.BuildableStyle;
import io.deephaven.csv.parsers.Parser;
import io.deephaven.csv.parsers.Parsers;
import io.deephaven.csv.tokenization.Tokenizer;
import org.immutables.value.Value.Default;
import org.immutables.value.Value.Immutable;
import org.jetbrains.annotations.Nullable;

import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;

/**
 * A specification object for parsing CSV input.
 */
@Immutable
@BuildableStyle
public abstract class CsvSpecs {

    public interface Builder {
        /**
         * Copy all of the parameters from {@code specs} into {@code this} builder.
         */
        Builder from(CsvSpecs specs);

        /**
         * Client-specified headers that can be used to override the existing headers in the input (if
         * {@link #hasHeaderRow()} is true), or to provide absent headers (if {@link #hasHeaderRow()} is false).
         */
        Builder headers(Iterable elements);

        /**
         * Override a specific column header by number. This is applied after {@link #headers()}. Column numbers start
         * with 1.
         */
        Builder putHeaderForIndex(int index, String header);

        /**
         * The parsers that the user wants to participate in type inference. Note that the order that the parsers in
         * this list matters only for custom parsers. In particular:
         * 
         * Standard system parsers (singletons from the {@link Parsers} class) will run in their standard precedence
         * order, regardless of the order they appear here.
         * All specified system parsers will be run before any specified custom parsers.
         * Custom parsers will be run in the order they are specified here.
         * 
         *
         * @return the parsers
         */
        Builder parsers(Iterable> elements);

        /**
         * Used to force a specific parser for a specific column, specified by column name. Specifying a parser forgoes
         * column inference for that column.
         */
        Builder putParserForName(String columnName, Parser parser);

        /**
         * Used to force a specific parser for a specific column, specified by column number. Column numbers start with
         * 1. Specifying a parser forgoes column inference for that column.
         */
        Builder putParserForIndex(int index, Parser parser);

        /**
         * The default string that means "null value" in the input. This default is used for a column if there is no
         * corresponding {@link #nullValueLiteralForName()} or {@link #nullValueLiteralForName()} specified for that
         * column.
         */
        Builder nullValueLiteral(String nullValueLiteral);

        /**
         * The null value literal for specific columns, specified by column name. Specifying a null value literal for a
         * column overrides the value in {@link #nullValueLiteral()}.
         */
        Builder putNullValueLiteralForName(String columnName, String nullValueLiteral);

        /**
         * The null value literal for specific columns, specified by 1-based column index. Specifying a null value
         * literal for a column overrides the value in {@link #nullValueLiteral()}.
         */
        Builder putNullValueLiteralForIndex(int index, String nullValueLiteral);

        /**
         * The parser to uses when all values in the column are null. Defaults to {@code Parsers#STRING}.
         */
        Builder nullParser(Parser parser);

        /**
         * An optional low-level parser that understands custom time zones.
         */
        Builder customTimeZoneParser(Tokenizer.CustomTimeZoneParser customTimeZoneParser);

        /**
         * An optional legalizer for column headers. The legalizer is a function that takes column names (as a
         * {@code String[]}) names and returns legal column names (as a {@code String[]}). The legalizer function is
         * permitted to reuse its input data structure. Defaults to {@code Function#identity()}.
         */
        Builder headerLegalizer(Function headerLegalizer);

        /**
         * An optional validator for column headers. The validator is a {@link Predicate} function that takes a column
         * name and returns a true if it is a legal column name, false otherwise. Defaults to {@code c -> true}.
         */
        Builder headerValidator(Predicate headerValidator);

        /**
         * An optional low-level parser that understands custom time zones.
         */
        Builder hasHeaderRow(boolean hasHeaderRow);

        /**
         * The field delimiter character (the character that separates one column from the next). Must be 7-bit ASCII.
         * Defaults to {code ','}.
         */
        Builder delimiter(char delimiter);

        /**
         * The quote character (used when you want field or line delimiters to be interpreted as literal text. Must be
         * 7-bit ASCII. Defaults to {@code '"'}. For example:
         *
         *          * 123,"hello, there",456,
         * 
         *
         * Would be read as the three fields:
         *
         * 
         * 123
         * 
hello, there
         * 
456
         * 
         */
        Builder quote(char quote);

        /**
         * Whether to trim leading and trailing blanks from non-quoted values. Defaults to {@code true}.
         */
        Builder ignoreSurroundingSpaces(boolean ignoreSurroundingSpaces);

        /**
         * Whether to trim leading and trailing blanks from inside quoted values. Defaults to {@code false}.
         */
        Builder trim(boolean trim);

        /**
         * Whether to run concurrently. In particular, the operation that reads the raw file, breaks it into columns,
         * and stores that column text in memory can run in parallel with the column parsers, and the parsers can run in
         * parallel with each other.
         */
        Builder concurrent(boolean async);

        CsvSpecs build();
    }

    /**
     * Creates a builder for {@link CsvSpecs}.
     */
    public static Builder builder() {
        return ImmutableCsvSpecs.builder();
    }

    /**
     * A comma-separated-value delimited format.
     */
    public static CsvSpecs csv() {
        return builder().build();
    }

    /**
     * A tab-separated-value delimited format. Equivalent to {@code builder().delimiter('\t').build()}.
     */
    public static CsvSpecs tsv() {
        return builder().delimiter('\t').build();
    }

    /**
     * A header-less, CSV format. Equivalent to {@code builder().hasHeaderRow(false).build()}.
     */
    public static CsvSpecs headerless() {
        return builder().hasHeaderRow(false).build();
    }

    /**
     * See {@link Builder#headers}.
     */
    public abstract List headers();

    /**
     * See {@link Builder#putHeaderForIndex}
     */
    public abstract Map headerForIndex();

    /**
     * See {@link Builder#parsers}.
     */
    @Default
    public List> parsers() {
        return Parsers.DEFAULT;
    }

    /**
     * See {@link Builder#putParserForName}.
     */
    public abstract Map> parserForName();

    /**
     * See {@link Builder#putParserForIndex}.
     */
    public abstract Map> parserForIndex();

    /**
     * See {@link Builder#nullValueLiteral}.
     */
    @Default
    public String nullValueLiteral() {
        return "";
    }

    /**
     * See {@link Builder#nullValueLiteral}.
     */
    public abstract Map nullValueLiteralForName();

    /**
     * See {@link Builder#putNullValueLiteralForIndex}.
     */
    public abstract Map nullValueLiteralForIndex();

    /**
     * See {@link Builder#nullParser}.
     */
    @Default
    @Nullable
    public Parser nullParser() {
        return Parsers.STRING;
    }

    /**
     * See {@link Builder#customTimeZoneParser}.
     */
    @Default
    @Nullable
    public Tokenizer.CustomTimeZoneParser customTimeZoneParser() {
        return null;
    }

    /**
     * See {@link Builder#headerLegalizer}.
     */
    @Default
    public Function headerLegalizer() {
        return Function.identity();
    }

    /**
     * See {@link Builder#headerValidator}.
     */
    @Default
    public Predicate headerValidator() {
        return c -> true;
    }

    /**
     * See {@link Builder#hasHeaderRow}.
     */
    @Default
    public boolean hasHeaderRow() {
        return true;
    }

    /**
     * See {@link Builder#delimiter}.
     */
    @Default
    public char delimiter() {
        return ',';
    }

    /**
     * See {@link Builder#quote}.
     */
    @Default
    public char quote() {
        return '"';
    }

    /**
     * See {@link Builder#ignoreSurroundingSpaces}.
     */
    @Default
    public boolean ignoreSurroundingSpaces() {
        return true;
    }

    /**
     * See {@link Builder#trim}.
     */
    @Default
    public boolean trim() {
        return false;
    }

    /**
     * See {@link Builder#concurrent}.
     */
    @Default
    public boolean concurrent() {
        return true;
    }
}