io.questdb.log.LogRecordUtf8Sink Maven / Gradle / Ivy

Go to download
/*******************************************************************************
 *     ___                  _   ____  ____
 *    / _ \ _   _  ___  ___| |_|  _ \| __ )
 *   | | | | | | |/ _ \/ __| __| | | |  _ \
 *   | |_| | |_| |  __/\__ \ |_| |_| | |_) |
 *    \__\_\\__,_|\___||___/\__|____/|____/
 *
 *  Copyright (c) 2014-2019 Appsicle
 *  Copyright (c) 2019-2024 QuestDB
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

package io.questdb.log;

import io.questdb.std.Misc;
import io.questdb.std.Mutable;
import io.questdb.std.Unsafe;
import io.questdb.std.str.*;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

/**
 * Fixed-capacity UTF-8 sink backed by the native memory range {@code [address, lim)}.
 * <p>
 * All content-writing methods keep {@link #EOL_LENGTH} bytes at the end of the range
 * free so that {@link #putEOL()} can still append the line terminator. Content that
 * does not fit is truncated, and the truncation logic avoids leaving a partially
 * written multibyte UTF-8 codepoint in the buffer. Once a codepoint has been rejected
 * for lack of space, every later content write is ignored until {@link #clear()} is
 * called, so the line is cut rather than having the non-fitting characters skipped.
 */
public class LogRecordUtf8Sink implements Utf8Sink, DirectUtf8Sequence, Sinkable, Mutable {
    public static final int EOL_LENGTH = Misc.EOL.length();
    // Sentinel byte class: the byte cannot appear in well-formed UTF-8.
    private final static int UTF8_BYTE_CLASS_BAD = -1;
    // Sentinel byte class: the byte is a continuation byte (bit pattern 10xxxxxx).
    private final static int UTF8_BYTE_CLASS_CONTINUATION = 0;
    // Start of the buffer.
    protected final long address;
    // Exclusive end of the buffer.
    protected final long lim;
    private final AsciiCharSequence asciiCharSequence = new AsciiCharSequence();
    // Current write position, always within [address, lim].
    protected long _wptr;
    // Set once a codepoint did not fit; content writes are dropped until clear().
    private boolean done = false;
    private int level;

    /**
     * @param address     start of the memory range to write into
     * @param addressSize number of bytes available starting at {@code address}
     */
    public LogRecordUtf8Sink(long address, long addressSize) {
        this.address = _wptr = address;
        this.lim = address + addressSize;
    }

    /**
     * Returns a reusable ASCII char-sequence view over this sink; the same view
     * instance is re-pointed at this sink on every call.
     */
    @Override
    public @NotNull CharSequence asAsciiCharSequence() {
        return asciiCharSequence.of(this);
    }

    /**
     * Reads the byte at the given offset from the start of the buffer.
     * No bounds checking is performed.
     */
    @Override
    public byte byteAt(int index) {
        return Unsafe.getUnsafe().getByte(address + index);
    }

    /**
     * Rewinds the write position to the start of the buffer and re-enables writes
     * after a truncation. The level is left unchanged.
     */
    @Override
    public void clear() {
        _wptr = address;
        done = false;
    }

    public int getLevel() {
        return level;
    }

    /**
     * Returns the start address of the buffer.
     */
    @Override
    public long ptr() {
        return address;
    }

    /**
     * Appends the bytes of the given sequence, truncating on a codepoint boundary
     * when the sequence does not fit in the space left before the EOL reserve.
     *
     * @param us sequence to append; {@code null} is ignored
     * @return this sink
     */
    @Override
    public Utf8Sink put(@Nullable Utf8Sequence us) {
        if (us == null || done) {
            // Once the line has been truncated nothing more may be appended, even if
            // it would fit: that would skip the rejected codepoint instead of cutting
            // the line, which is what put(byte) already guards against.
            return this;
        }

        final int rem = (int) (lim - _wptr - EOL_LENGTH);
        final int size = us.size();
        if (rem >= size) {
            // Common case where the buffer fits the available space.
            Utf8s.strCpy(us, size, _wptr);
            _wptr += size;
            return this;
        }

        // The line is being truncated:
        // We determine a safe length to byte-copy.
        // We skip copying the last 4 bytes, as they may be a multibyte UTF-8 codepoint.
        // NOTE: The computed length may be negative.
        int safeLen = rem - 4;
        if (safeLen > 0) {
            Utf8s.strCpy(us, safeLen, _wptr);
            _wptr += safeLen;
        }

        safeLen = Math.max(0, safeLen);
        for (int i = safeLen; i < rem; i++) {
            // Copying the final few bytes one at a time ensures we don't write any partial codepoints.
            put(us.byteAt(i));
        }
        return this;
    }

    /**
     * Appends a single UTF-8 byte.
     * <p>
     * While at least 4 bytes remain before the EOL reserve the byte is written
     * unconditionally. In the last few bytes a byte is written only when the rest of
     * the codepoint it belongs to still fits; otherwise the sink is marked as
     * truncated and this and all later bytes are dropped.
     *
     * @param b byte to append
     * @return this sink
     */
    @Override
    public Utf8Sink put(byte b) {
        final long left = lim - _wptr - EOL_LENGTH;
        if (left >= 4) { // 4 is the maximum byte length for a UTF-8 character.
            Unsafe.getUnsafe().putByte(_wptr++, b);
            return this;
        }

        // We're now down to the last few bytes of the line.
        // As such we need to be careful not to write a partial UTF-8 character.

        if (done) {
            // If we've detected a character that is too long for the buffer,
            // then we need to stop processing any later characters.
            // In other words, we want to truncate the log line, not skip over the characters that don't fit.
            //
            // Take the following string:
            //     >>> "I'd like some apple π!".encode('utf-8')
            //     b"I'd like some apple \xcf\x80!"
            //     >>> len(_)
            //     23
            //
            // Encoded, it's 23 bytes. Let's assume that the buffer is only 22 bytes.
            // Without the `done` flag it would serialize out as: "I'd like some apple !"
            // incorrectly writing the '!' byte as there would have been enough space for it.
            return this;
        }

        int needed = utf8CharNeeded(b);
        if (needed == UTF8_BYTE_CLASS_BAD) {
            // Invalid UTF-8 byte, sentinel replacement -- this should never happen in practice.
            b = (byte) '?';
            needed = 1;
        }

        if (left >= needed) {
            Unsafe.getUnsafe().putByte(_wptr++, b);
        } else {
            done = true;
        }

        return this;
    }

    /**
     * Appends {@link Misc#EOL}, or as much of it as fits before the end of the buffer.
     * Unlike the content-writing methods this may use the reserved tail bytes and is
     * not suppressed after a truncation.
     *
     * @return this sink
     */
    @Override
    public Utf8Sink putEOL() {
        final int rem = (int) (lim - _wptr);
        final int n = Math.min(rem, EOL_LENGTH);
        Utf8s.strCpyAscii(Misc.EOL, n, _wptr);
        _wptr += n;
        return this;
    }

    /**
     * Appends the UTF-8 bytes stored in native memory between {@code lo} (inclusive)
     * and {@code hi} (exclusive), truncating on a codepoint boundary when they do not
     * fit in the space left before the EOL reserve.
     *
     * @param lo address of the first byte to copy
     * @param hi address one past the last byte to copy
     * @return this sink
     */
    @Override
    public Utf8Sink putNonAscii(long lo, long hi) {
        if (done) {
            // The line has already been truncated; see put(byte) for the rationale.
            return this;
        }

        final long rem = (lim - _wptr - EOL_LENGTH);
        final long size = hi - lo;
        if (rem >= size) {
            // Common case where the buffer fits the available space.
            Unsafe.getUnsafe().copyMemory(lo, _wptr, size);
            _wptr += size;
            return this;
        }

        // The line is being truncated:
        // We determine a safe length to byte-copy.
        // We skip copying the last 4 bytes, as they may be a multibyte UTF-8 codepoint.
        // NOTE: The computed length may be negative.
        long safeLen = rem - 4;
        if (safeLen > 0) {
            Unsafe.getUnsafe().copyMemory(lo, _wptr, safeLen);
            _wptr += safeLen;
        }

        safeLen = Math.max(0, safeLen);
        for (long i = safeLen; i < rem; i++) {
            // Copying the final few bytes one at a time ensures we don't write any partial codepoints.
            put(Unsafe.getUnsafe().getByte(lo + i));
        }
        return this;
    }

    public void setLevel(int level) {
        this.level = level;
    }

    /**
     * Returns the number of bytes written so far.
     */
    @Override
    public int size() {
        return (int) (_wptr - address);
    }

    /**
     * Copies the written bytes into the given sink, transcoding to UTF-16 when the
     * target sink reports that encoding. Any other encoding trips an assertion.
     *
     * @param sink destination sink
     */
    @Override
    public void toSink(@NotNull CharSink sink) {
        switch (sink.getEncoding()) {
            case CharSinkEncoding.UTF8:
                sink.putNonAscii(address, _wptr);
                break;
            case CharSinkEncoding.UTF16:
                Utf8s.utf8ToUtf16(address, _wptr, (Utf16Sink) sink);
                break;
            default:
                assert false : "unsupported sink encoding";
                break;
        }
    }

    /**
     * Decodes the written bytes into a {@link String}.
     */
    @Override
    public @NotNull String toString() {
        return Utf8s.stringFromUtf8Bytes(address, _wptr);
    }

    /**
     * Classifies a byte by its leading bits.
     *
     * @param b byte to classify
     * @return the total length (1 to 4) of the codepoint that {@code b} starts,
     * {@link #UTF8_BYTE_CLASS_CONTINUATION} for a continuation byte, or
     * {@link #UTF8_BYTE_CLASS_BAD} for a byte matching none of the patterns
     */
    private static int utf8ByteClass(byte b) {
        // Reference the table at:
        // https://en.wikipedia.org/wiki/UTF-8#Encoding
        if (b >= 0) {
            // ASCII
            return 1;
        } else if ((b & 0xC0) == 0x80) {
            // 0xC0 = 1100 0000, 0x80 = 1000 0000, check if starts with 10
            return UTF8_BYTE_CLASS_CONTINUATION;
        } else if ((b & 0xE0) == 0xC0) {
            // 0xE0 = 1110 0000, 0xC0 = 1100 0000, check if starts with 110
            return 2;
        } else if ((b & 0xF0) == 0xE0) {
            // 0xF0 = 1111 0000, 0xE0 = 1110 0000, check if starts with 1110
            return 3;
        } else if ((b & 0xF8) == 0xF0) {
            // 0xF8 = 1111 1000, 0xF0 = 1111 0000, check if starts with 1111 0
            return 4;
        } else {
            return UTF8_BYTE_CLASS_BAD;
        }
    }

    /**
     * Computes how many bytes, including {@code b} itself, are still required to
     * complete the codepoint that {@code b} belongs to, given what has already been
     * written to the buffer.
     *
     * @param b the byte about to be written
     * @return the number of bytes needed (at least 1), or {@link #UTF8_BYTE_CLASS_BAD}
     * when {@code b} is invalid or is a continuation byte with no matching lead byte
     * within the previous 4 written bytes
     */
    private int utf8CharNeeded(byte b) {
        final int byteClass = utf8ByteClass(b);
        switch (byteClass) {
            case UTF8_BYTE_CLASS_BAD:
                return UTF8_BYTE_CLASS_BAD;

            case UTF8_BYTE_CLASS_CONTINUATION: {
                // We've been dropped into the middle of a multibyte character
                // without prior knowledge of how long it is.
                // We now need to look back to find the start of the character.
                int multibyteLength = UTF8_BYTE_CLASS_BAD;
                long ptr = _wptr - 1;
                // Never look further back than 4 bytes or the start of the buffer.
                final long boundary = Math.max(address, _wptr - 4);

                lookback:
                for (; ptr >= boundary; --ptr) {
                    final byte prev = Unsafe.getUnsafe().getByte(ptr);
                    multibyteLength = utf8ByteClass(prev);
                    switch (multibyteLength) {
                        case UTF8_BYTE_CLASS_BAD:
                            return UTF8_BYTE_CLASS_BAD;
                        case UTF8_BYTE_CLASS_CONTINUATION:
                            continue;
                        default:
                            break lookback;
                    }
                }
                // Adjust the obtained length to account for the number of bytes looked back.
                multibyteLength -= (int) (_wptr - ptr);
                // Normalize errors in case of an illegal ascii character followed by one or more continuation bytes.
                // This also covers the lookback finding no lead byte at all.
                if (multibyteLength < 1) {
                    multibyteLength = UTF8_BYTE_CLASS_BAD;
                }
                return multibyteLength;
            }

            default:
                return byteClass;
        }
    }
}