org.finos.tracdap.common.codec.json.JacksonValues Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tracdap-lib-data Show documentation
TRAC D.A.P. data library, interfaces and core functionality for working with primary data
There is a newer version: 0.7.0
/*
 * Copyright 2022 Accenture Global Solutions Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.finos.tracdap.common.codec.json;


import org.finos.tracdap.common.exception.EDataTypeNotSupported;
import org.finos.tracdap.common.exception.EUnexpected;
import org.finos.tracdap.common.metadata.MetadataCodec;

import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.dataformat.csv.CsvGenerator;

import org.apache.arrow.vector.types.Types;
import org.apache.arrow.vector.*;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.nio.charset.StandardCharsets;
import java.time.*;
import java.time.format.DateTimeParseException;
import java.util.List;

import static com.fasterxml.jackson.core.JsonToken.VALUE_NUMBER_INT;


public class JacksonValues {

    private static final Logger log = LoggerFactory.getLogger(JacksonValues.class);

    // Standard NaN / infinity values are what gets quoted when encoding data
    // For consistency, these values match the output of the Apache Arrow CSV implementation
    private static final String STANDARD_NAN = "nan";
    private static final String STANDARD_POSITIVE_INFINITY = "inf";
    private static final String STANDARD_NEGATIVE_INFINITY = "-inf";

    // Values that are recognised as NaN / infinity during decoding
    private static final List NAN_VALUES = List.of("nan", "na");
    private static final List INFINITY_VALUES = List.of("inf", "infinity");


    public static void parseAndSet(
            FieldVector vector, int row,
            JsonParser parser, JsonToken token)
            throws IOException {

        var field = vector.getField();

        // Make sure not to write nulls into non-nullable fields
        // At the codec level this is a fatal error; we can't generate invalid Arrow data
        // Higher up the stack there can be options to filter / exclude

        int isSet = (token == JsonToken.VALUE_NULL) ? 0 : 1;
        boolean isNullable = field.isNullable();

        if (isSet == 0 && !isNullable) {

            var msg = "Parsing failed: Got a null value for not-null field [" + vector.getField().getName() + "]";
            throw new JsonParseException(parser, msg, parser.currentLocation());
        }


        var minorType = vector.getMinorType();

        switch (minorType) {

            case BIT:

                BitVector boolVec = (BitVector) vector;
                int boolVal;

                if (token == JsonToken.VALUE_NULL)
                    boolVal = 0;
                else if (token == JsonToken.VALUE_TRUE)
                    boolVal = 1;
                else if (token == JsonToken.VALUE_FALSE)
                    boolVal = 0;
                else if (token == JsonToken.VALUE_STRING) {
                    String boolStr = parser.getValueAsString();
                    boolVal = Boolean.parseBoolean(boolStr) ? 1 : 0;
                }
                else if (token == VALUE_NUMBER_INT) {
                    boolVal = parser.getIntValue();
                    if (boolVal != 0 && boolVal != 1)
                        throw new JsonParseException(parser, "Invalid boolean value", parser.currentLocation());
                }
                else
                    throw new JsonParseException(parser, "Invalid boolean value", parser.currentLocation());

                boolVec.set(row, isSet, boolVal);

                break;

            case BIGINT:

                BigIntVector int64Vec = (BigIntVector) vector;
                long int64Val = isSet != 0 ? parser.getLongValue() : 0;

                int64Vec.set(row, isSet, int64Val);

                break;

            case INT:

                IntVector int32Vec = (IntVector) vector;
                int int32Val = isSet != 0 ? parser.getIntValue() : 0;

                int32Vec.set(row, isSet, int32Val);

                break;

            case SMALLINT:

                SmallIntVector int16Vec = (SmallIntVector) vector;
                short int16Val = isSet != 0 ? parser.getShortValue() : 0;

                int16Vec.set(row, isSet, int16Val);

                break;

            case TINYINT:

                TinyIntVector int8Vec = (TinyIntVector) vector;
                byte int8Val = isSet != 0 ? parser.getByteValue() : 0;

                int8Vec.set(row, isSet, int8Val);

                break;

            case FLOAT8:

                Float8Vector doubleVec = (Float8Vector) vector;
                double doubleVal = isSet != 0 ? parseFloat8(parser, token) : 0;

                doubleVec.set(row, isSet, doubleVal);

                break;

            case FLOAT4:

                Float4Vector floatVec = (Float4Vector) vector;
                float floatVal = isSet != 0 ? parseFloat4(parser, token) : 0;

                floatVec.set(row, isSet, floatVal);

                break;

            case DECIMAL:

                DecimalVector decimal128Vec = (DecimalVector) vector;

                if (isSet == 0)
                    decimal128Vec.setNull(row);

                else {
                    BigDecimal decimal128Val = parseBigDecimal(parser, token, decimal128Vec.getScale());
                    decimal128Vec.set(row, decimal128Val);
                }

                break;

            case DECIMAL256:

                Decimal256Vector decimal256Vec = (Decimal256Vector) vector;

                if (isSet == 0)
                    decimal256Vec.setNull(row);

                else {
                    BigDecimal decimal256Val = parseBigDecimal(parser, token, decimal256Vec.getScale());
                    decimal256Vec.set(row, decimal256Val);
                }

                break;

            case VARCHAR:

                VarCharVector varcharVec = (VarCharVector) vector;

                if (isSet == 0)
                    varcharVec.setNull(row);

                else {
                    String varcharVal = parser.getValueAsString();

                    // For variable width vectors, the required size of the content buffer is not known up front
                    // Arrow makes an initial guess, but sometimes it will need to reallocate on write
                    // So, we need to call setSafe() instead of set(), to avoid a buffer overflow

                    varcharVec.setSafe(row, varcharVal.getBytes(StandardCharsets.UTF_8));
                }

                break;

            case DATEDAY:

                DateDayVector dateVec = (DateDayVector) vector;

                if (isSet == 0)
                    dateVec.setNull(row);

                else {

                    LocalDate dateVal = parseLocalDate(parser);
                    int unixEpochDay = (int) dateVal.toEpochDay();

                    dateVec.set(row, isSet, unixEpochDay);
                }

                break;

            case TIMESTAMPMILLI:

                TimeStampMilliVector timeStampMVec = (TimeStampMilliVector) vector;

                if (isSet == 0)
                    timeStampMVec.setNull(row);

                else {

                    LocalDateTime datetimeNoZone = parseDatetimeNoZone(parser);
                    long unixEpochMillis =
                            (datetimeNoZone.toEpochSecond(ZoneOffset.UTC) * 1000) +
                            (datetimeNoZone.getNano() / 1000000);

                    timeStampMVec.set(row, unixEpochMillis);
                }

                break;

                // For handling TZ type:
                // 1. ArrowType.Timestamp mtzType = (ArrowType.Timestamp) field.getType();
                // 2. ZoneOffset mtzOffset = ZoneOffset.of(mtzType.getTimezone());

            default:

                // This error does not relate to the data, only to the target column type
                // So, do not include parse location in the error message

                var err = String.format(
                        "Data type not supported for field: [%s] %s (%s)",
                        field.getName(), field.getType(), vector.getMinorType());

                log.error(err);
                throw new EDataTypeNotSupported(err);
        }
    }

    public static void setEmptyString(FieldVector vector, int row) {

        var minorType = vector.getMinorType();

        if (minorType == Types.MinorType.VARCHAR) {

            VarCharVector varcharVec = (VarCharVector) vector;
            varcharVec.setSafe(row, "".getBytes(StandardCharsets.UTF_8));
        }
        else {

                throw new EUnexpected();
        }
    }

    static private float parseFloat4(JsonParser parser, JsonToken token) throws IOException {

        try {

            if (parser.currentToken() == JsonToken.VALUE_NUMBER_INT ||
                parser.currentToken() == JsonToken.VALUE_NUMBER_FLOAT) {

                return parser.getFloatValue();
            }
            else if (parser.currentToken() == JsonToken.VALUE_STRING) {

                // Jackson does not recognise the default infinity value encoded by Arrow, which is "inf"
                // This logic allows for special values encoded using a variety of standard words

                var lowerToken = parser.getValueAsString().toLowerCase();

                if (NAN_VALUES.contains(lowerToken))
                    return Float.NaN;

                if (INFINITY_VALUES.contains(lowerToken))
                    return Float.POSITIVE_INFINITY;

                if (lowerToken.startsWith("-") && INFINITY_VALUES.contains(lowerToken.substring(1)))
                    return Float.NEGATIVE_INFINITY;

                return Float.parseFloat(parser.getValueAsString());
            }
            else {

                var msg = "Parsing failed: Excepted a floating point value, got [" + token.name() + "]";
                throw new JsonParseException(parser, msg, parser.currentLocation());
            }
        }
        catch (NumberFormatException e) {

            throw new JsonParseException(parser, e.getMessage(), parser.currentLocation(), e);
        }
    }

    static private double parseFloat8(JsonParser parser, JsonToken token) throws IOException {

        try {

            if (parser.currentToken() == JsonToken.VALUE_NUMBER_INT ||
                parser.currentToken() == JsonToken.VALUE_NUMBER_FLOAT) {

                return parser.getDoubleValue();
            }
            else if (parser.currentToken() == JsonToken.VALUE_STRING) {

                // Jackson does not recognise the default infinity value encoded by Arrow, which is "inf"
                // This logic allows for special values encoded using a variety of standard words

                var lowerToken = parser.getValueAsString().toLowerCase();

                if (NAN_VALUES.contains(lowerToken))
                    return Double.NaN;

                if (INFINITY_VALUES.contains(lowerToken))
                    return Double.POSITIVE_INFINITY;

                if (lowerToken.startsWith("-") && INFINITY_VALUES.contains(lowerToken.substring(1)))
                    return Double.NEGATIVE_INFINITY;

                return Double.parseDouble(parser.getValueAsString());
            }
            else {

                var msg = "Parsing failed: Excepted a floating point value, got [" + token.name() + "]";
                throw new JsonParseException(parser, msg, parser.currentLocation());
            }
        }
        catch (NumberFormatException e) {

            throw new JsonParseException(parser, e.getMessage(), parser.currentLocation(), e);
        }
    }

    static private BigDecimal parseBigDecimal(JsonParser parser, JsonToken token, int scale) throws IOException {

        try {

            BigDecimal decimalVal;

            if (token == VALUE_NUMBER_INT || token == JsonToken.VALUE_NUMBER_FLOAT)
                decimalVal = parser.getDecimalValue();
            else if (token == JsonToken.VALUE_STRING)
                decimalVal = new BigDecimal(parser.getValueAsString());
            else {
                var msg = "Parsing failed: Excepted a decimal, got [" + token.name() + "]";
                throw new JsonParseException(parser, msg, parser.currentLocation());
            }


            if (decimalVal.scale() == scale)
                return decimalVal;

            else
                // Scale the decimal to match the scale of the arrow vector
                return decimalVal.setScale(scale, RoundingMode.UNNECESSARY);
        }
        catch (NumberFormatException e) {
            throw new JsonParseException(parser, e.getMessage(), parser.currentLocation(), e);
        }
    }

    static private LocalDate parseLocalDate(JsonParser parser) throws IOException {

        try {
            String dateStr = parser.getValueAsString();
            return LocalDate.parse(dateStr, MetadataCodec.ISO_DATE_FORMAT);
        }
        catch (DateTimeParseException e) {
            throw new JsonParseException(parser, e.getMessage(), parser.currentLocation(), e);
        }
    }

    static private LocalDateTime parseDatetimeNoZone(JsonParser parser) throws IOException {

        try {
            String datetimeStr = parser.getValueAsString();
            return LocalDateTime.parse(datetimeStr, MetadataCodec.ISO_DATETIME_INPUT_NO_ZONE_FORMAT);
        }
        catch (DateTimeParseException e) {
            throw new JsonParseException(parser, e.getMessage(), parser.currentLocation(), e);
        }
    }

    public static void getAndGenerate(FieldVector vector, int row, JsonGenerator generator) throws IOException {

        boolean isNull = vector.isNull(row);

        if (isNull) {
            generator.writeNull();
            return;
        }

        var minorType = vector.getMinorType();

        switch (minorType) {

            case BIT:

                BitVector boolVec = (BitVector) vector;
                int boolVal = boolVec.get(row);

                generator.writeBoolean(boolVal != 0);

                break;

            case BIGINT:

                BigIntVector int64Vec = (BigIntVector) vector;
                long int64Val = int64Vec.get(row);

                generator.writeNumber(int64Val);

                break;

            case INT:

                IntVector int32Vec = (IntVector) vector;
                int int32Val = int32Vec.get(row);

                generator.writeNumber(int32Val);

                break;

            case SMALLINT:

                SmallIntVector int16Vec = (SmallIntVector) vector;
                short int16Val = int16Vec.get(row);

                generator.writeNumber(int16Val);

                break;

            case TINYINT:

                TinyIntVector int8Vec = (TinyIntVector) vector;
                byte int8Val = int8Vec.get(row);

                generator.writeNumber(int8Val);

                break;

            case FLOAT8:

                Float8Vector doubleVec = (Float8Vector) vector;
                double doubleVal = doubleVec.get(row);

                // Output NaN and infinities using the same standard format as the Apache Arrow CSV implementation

                if (Double.isNaN(doubleVal))
                    quoteNanAsString(generator, STANDARD_NAN);
                else if (Double.isInfinite(doubleVal)) {
                    if (doubleVal > 0)
                        quoteNanAsString(generator, STANDARD_POSITIVE_INFINITY);
                    else
                        quoteNanAsString(generator, STANDARD_NEGATIVE_INFINITY);
                }
                else
                    generator.writeNumber(doubleVal);

                break;

            case FLOAT4:

                Float4Vector floatVec = (Float4Vector) vector;
                float floatVal = floatVec.get(row);

                // Output NaN and infinities using the same standard format as the Apache Arrow CSV implementation

                if (Float.isNaN(floatVal))
                    quoteNanAsString(generator, STANDARD_NAN);
                else if (Float.isInfinite(floatVal)) {
                    if (floatVal > 0)
                        quoteNanAsString(generator, STANDARD_POSITIVE_INFINITY);
                    else
                        quoteNanAsString(generator, STANDARD_NEGATIVE_INFINITY);
                }
                else
                    generator.writeNumber(floatVal);

                break;

            case DECIMAL:

                DecimalVector decimal128Vec = (DecimalVector) vector;
                BigDecimal decimal128Val = decimal128Vec.getObject(row);

                // This will render zeroes as "0" when the scale is large, preferable to 0e-12
                // For small scales use the default rendering, particularly currency with scale == 2

                if (decimal128Vec.getScale() > 3 && BigDecimal.ZERO.compareTo(decimal128Val) == 0)
                    generator.writeString(BigDecimal.ZERO.toString());
                else
                    generator.writeString(decimal128Val.toString());

                break;

            case DECIMAL256:

                Decimal256Vector decimal256Vec = (Decimal256Vector) vector;
                BigDecimal decimal256Val = decimal256Vec.getObject(row);

                if (decimal256Vec.getScale() > 3 && BigDecimal.ZERO.compareTo(decimal256Val) == 0)
                    generator.writeString(BigDecimal.ZERO.toString());
                else
                    generator.writeString(decimal256Val.toString());

                break;

            case VARCHAR:

                VarCharVector varcharVec = (VarCharVector) vector;
                String varcharVal = new String(varcharVec.get(row), StandardCharsets.UTF_8);

                generator.writeString(varcharVal);

                break;

            case DATEDAY:

                DateDayVector dateVec = (DateDayVector) vector;
                int unixEpochDay = dateVec.get(row);
                LocalDate dateVal = LocalDate.ofEpochDay(unixEpochDay);
                String dateStr = dateVal.format(MetadataCodec.ISO_DATE_FORMAT);

                generator.writeString(dateStr);

                break;

            case TIMESTAMPMILLI:

                TimeStampMilliVector timeStampMVec = (TimeStampMilliVector) vector;

                long epochMillis = timeStampMVec.get(row);
                long epochSeconds = epochMillis / 1000;
                int nanos = (int) (epochMillis % 1000) * 1000000;

                if (epochSeconds < 0 && nanos != 0) {
                    --epochSeconds;
                    nanos = nanos + 1000000000;
                }

                LocalDateTime localDatetimeVal = LocalDateTime.ofEpochSecond(epochSeconds, nanos, ZoneOffset.UTC);
                OffsetDateTime offsetDatetimeVal = localDatetimeVal.atOffset(ZoneOffset.UTC);
                String datetimeStr = MetadataCodec.ISO_DATETIME_NO_ZONE_FORMAT.format(offsetDatetimeVal);

                generator.writeString(datetimeStr);

                break;

            // For handling TZ type:
            // 1. ArrowType.Timestamp mtzType = (ArrowType.Timestamp) field.getType();
            // 2. ZoneOffset mtzOffset = ZoneOffset.of(mtzType.getTimezone());

            default:

                // This error does not relate to the data, only to the target column type
                // So, do not include parse location in the error message

                var field = vector.getField();

                var err = String.format(
                        "Data type not supported for field: [%s] %s (%s)",
                        field.getName(), field.getType(), vector.getMinorType());

                log.error(err);
                throw new EDataTypeNotSupported(err);
        }
    }

    private static void quoteNanAsString(JsonGenerator generator, String nanValue) throws IOException {

        // Special handling for output of NaN values (NaN, +Infinity, -Infinity)

        // In JSON, NaN values are not valid numbers so need to be quoted as strings

        // In CSV NaN also looks like a string
        // The Apache Arrow CSV implementation only works with quoted strings for NaN
        // So, switch on quoting for NaN fields to match the Arrow implementation
        // This allows the runtime to use the Arrow (C++) CSV parser instead of the lenient (Python) fallback

        CsvGenerator csvGenerator = generator instanceof CsvGenerator
            ? (CsvGenerator) generator
            : null;

        boolean switchQuoting = ! CsvGenerator.Feature
                .ALWAYS_QUOTE_STRINGS
                .enabledIn(generator.getFormatFeatures());

        if (csvGenerator != null && switchQuoting) {
            csvGenerator.enable(CsvGenerator.Feature.ALWAYS_QUOTE_STRINGS);
        }

        generator.writeString(nanValue);

        if (csvGenerator != null && switchQuoting) {
            csvGenerator.disable(CsvGenerator.Feature.ALWAYS_QUOTE_STRINGS);
        }
    }
}