All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fasterxml.jackson.dataformat.smile.SmileParser Maven / Gradle / Ivy

Go to download

Support for reading and writing Smile ("binary JSON") encoded data using Jackson abstractions (streaming API, data binding, tree model)

The newest version!
package com.fasterxml.jackson.dataformat.smile;

import java.io.*;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import com.fasterxml.jackson.core.*;
import com.fasterxml.jackson.core.io.IOContext;
import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
import com.fasterxml.jackson.core.util.ByteArrayBuilder;

import static com.fasterxml.jackson.dataformat.smile.SmileConstants.BYTE_MARKER_END_OF_STRING;

public class SmileParser extends SmileParserBase
{
    /**
     * Enumeration that defines all togglable features for Smile generators.
     */
    public enum Feature implements FormatFeature
    {
        /**
         * Feature that determines whether 4-byte Smile header is mandatory in input,
         * or optional. If enabled, it means that only input that starts with the header
         * is accepted as valid; if disabled, header is optional. In latter case,
         * settings for content are assumed to be defaults.
         */
        REQUIRE_HEADER(true)
        ;

        final boolean _defaultState;
        final int _mask;

        /**
         * Method that calculates bit set (flags) of all features that
         * are enabled by default.
         */
        public static int collectDefaults()
        {
            int flags = 0;
            for (Feature f : values()) {
                if (f.enabledByDefault()) {
                    flags |= f.getMask();
                }
            }
            return flags;
        }

        private Feature(boolean defaultState) {
            _defaultState = defaultState;
            _mask = (1 << ordinal());
        }

        @Override public boolean enabledByDefault() { return _defaultState; }
        @Override public int getMask() { return _mask; }
        @Override public boolean enabledIn(int flags) { return (flags & getMask()) != 0; }
    }

    /**
     * Flag to indicate if the JDK version is 11 or later. This can be used in some methods
     * to choose more optimal behavior. In particular, jdk9+ have different internals for
     * the String class.
     *
     * @since 2.14.1
     */
    private static final boolean JDK11_OR_LATER;
    static {
        boolean recentJdk;
        try {
            // The strip method was added in jdk11, so use it to detect a newer version
            String.class.getMethod("strip");
            recentJdk = true;
        } catch (Exception e) {
            recentJdk = false;
        }
        JDK11_OR_LATER = recentJdk;
    }

    /*
    /**********************************************************
    /* Configuration
    /**********************************************************
     */

    /**
     * Codec used for data binding when (if) requested.
     */
    protected ObjectCodec _objectCodec;

    /*
    /**********************************************************
    /* Input source config, state (from ex StreamBasedParserBase)
    /**********************************************************
     */

    /**
     * Input stream that can be used for reading more content, if one
     * in use. May be null, if input comes just as a full buffer,
     * or if the stream has been closed.
     */
    protected InputStream _inputStream;

    /**
     * Current buffer from which data is read; generally data is read into
     * buffer from input source, but in some cases pre-loaded buffer
     * is handed to the parser.
     */
    protected byte[] _inputBuffer;

    /**
     * Flag that indicates whether the input buffer is recycable (and
     * needs to be returned to recycler once we are done) or not.
     *

* If it is not, it also means that parser can NOT modify underlying * buffer. */ protected boolean _bufferRecyclable; /* /********************************************************** /* Additional parsing state /********************************************************** */ /** * Type byte of the current token (as in) */ protected int _typeAsInt; /** * Flag that indicates that the current token has not yet * been fully processed, and needs to be finished for * some access (or skipped to obtain the next token) */ protected boolean _tokenIncomplete = false; /* /********************************************************** /* Life-cycle /********************************************************** */ public SmileParser(IOContext ctxt, int parserFeatures, int smileFeatures, ObjectCodec codec, ByteQuadsCanonicalizer sym, InputStream in, byte[] inputBuffer, int start, int end, boolean bufferRecyclable) { super(ctxt, parserFeatures, smileFeatures, sym); _objectCodec = codec; _inputStream = in; _inputBuffer = inputBuffer; _inputPtr = start; _inputEnd = end; _bufferRecyclable = bufferRecyclable; } @Override public ObjectCodec getCodec() { return _objectCodec; } @Override public void setCodec(ObjectCodec c) { _objectCodec = c; } /** * Helper method called when it looks like input might contain the signature; * and it is necessary to detect and handle signature to get configuration * information it might have. * * @return True if valid signature was found and handled; false if not */ protected boolean handleSignature(boolean consumeFirstByte, boolean throwException) throws IOException { if (consumeFirstByte) { ++_inputPtr; } byte b = _nextByteGuaranteed(); if (b != SmileConstants.HEADER_BYTE_2) { if (throwException) { _reportError(String.format( "Malformed content: signature not valid, starts with 0x3A but followed by 0x%02X, not 0x29", b & 0xFF)); } return false; } b = _nextByteGuaranteed(); if (b != SmileConstants.HEADER_BYTE_3) { if (throwException) { _reportError(String.format( "Malformed content: signature not valid, starts with 0x3A, 0x29, but followed by 0x%02X, not 0xA", b & 0xFF)); } return false; } // Good enough; just need version info from 4th byte... int ch = _nextByteGuaranteed(); int versionBits = (ch >> 4) & 0x0F; // but failure with version number is fatal, can not ignore if (versionBits != SmileConstants.HEADER_VERSION_0) { _reportError(String.format( "Header version number bits (0x%X) indicate unrecognized version; only 0x0 accepted by parser", versionBits)); } // can avoid tracking names, if explicitly disabled if ((ch & SmileConstants.HEADER_BIT_HAS_SHARED_NAMES) == 0) { _seenNames = null; _seenNameCount = -1; } // conversely, shared string values must be explicitly enabled if ((ch & SmileConstants.HEADER_BIT_HAS_SHARED_STRING_VALUES) != 0) { _seenStringValues = NO_STRINGS; _seenStringValueCount = 0; } _mayContainRawBinary = ((ch & SmileConstants.HEADER_BIT_HAS_RAW_BINARY) != 0); return true; } /* /********************************************************** /* Former StreamBasedParserBase methods /********************************************************** */ @Override public int releaseBuffered(OutputStream out) throws IOException { int count = _inputEnd - _inputPtr; if (count < 1) { return 0; } // let's just advance ptr to end int origPtr = _inputPtr; out.write(_inputBuffer, origPtr, count); return count; } @Override public Object getInputSource() { return _inputStream; } /* /********************************************************** /* Low-level reading, other /********************************************************** */ // @since 2.8 private final byte _nextByteGuaranteed() throws IOException { int ptr = _inputPtr; if (ptr < _inputEnd) { byte b = _inputBuffer[ptr]; _inputPtr = ptr+1; return b; } _loadMoreGuaranteed(); return _inputBuffer[_inputPtr++]; } protected final void _loadMoreGuaranteed() throws IOException { if (!_loadMore()) { _reportInvalidEOF(); } } protected final boolean _loadMore() throws IOException { //_currInputRowStart -= _inputEnd; if (_inputStream != null) { int count = _inputStream.read(_inputBuffer, 0, _inputBuffer.length); _currInputProcessed += _inputEnd; _streamReadConstraints.validateDocumentLength(_currInputProcessed); _inputPtr = 0; if (count > 0) { _inputEnd = count; return true; } // important: move pointer to same as end, to keep location accurate _inputEnd = 0; // End of input _closeInput(); // Should never return 0, so let's fail if (count == 0) { throw new IOException("InputStream.read() returned 0 characters when trying to read "+_inputBuffer.length+" bytes"); } } return false; } /** * Helper method that will try to load at least specified number bytes in * input buffer, possible moving existing data around if necessary. * Exception throws if not enough content can be read. * * @param minAvailable Minimum number of bytes we absolutely need * * @throws IOException if read failed, either due to I/O issue or because not * enough content could be read before end-of-input. */ protected final void _loadToHaveAtLeast(int minAvailable) throws IOException { // No input stream, no leading (either we are closed, or have non-stream input source) if (_inputStream == null) { throw _constructError(String.format( "Needed to read %d bytes, reached end-of-input", minAvailable)); } int missing = _tryToLoadToHaveAtLeast(minAvailable); if (missing > 0) { throw _constructError(String.format( "Needed to read %d bytes, only got %d before end-of-input", minAvailable, minAvailable - missing)); } } /** * Helper method that will try to load at least specified number bytes in * input buffer, possible moving existing data around if necessary. * * @return Number of bytes that were missing, if any; {@code 0} for successful * read * * @since 2.12.3 */ protected final int _tryToLoadToHaveAtLeast(int minAvailable) throws IOException { if (_inputStream == null) { return minAvailable; } // Need to move remaining data in front? int amount = _inputEnd - _inputPtr; _currInputProcessed += _inputPtr; _streamReadConstraints.validateDocumentLength(_currInputProcessed); if (amount > 0 && _inputPtr > 0) { //_currInputRowStart -= _inputPtr; System.arraycopy(_inputBuffer, _inputPtr, _inputBuffer, 0, amount); _inputEnd = amount; } else { _inputEnd = 0; } _inputPtr = 0; while (_inputEnd < minAvailable) { final int toRead = _inputBuffer.length - _inputEnd; int count = _inputStream.read(_inputBuffer, _inputEnd, toRead); if (count < 1) { // End of input _closeInput(); // Should never return 0, so let's fail if (count == 0) { throw new IOException("InputStream.read() returned 0 characters when trying to read "+amount+" bytes"); } return minAvailable - _inputEnd; } _inputEnd += count; } return 0; } @Override protected void _closeInput() throws IOException { if (_inputStream != null) { if (_ioContext.isResourceManaged() || isEnabled(JsonParser.Feature.AUTO_CLOSE_SOURCE)) { _inputStream.close(); } _inputStream = null; } } /* /********************************************************** /* Overridden methods /********************************************************** */ @Override public boolean hasTextCharacters() { if (_currToken == JsonToken.VALUE_STRING) { // yes; is or can be made available efficiently as char[] return _textBuffer.hasTextAsCharacters(); } if (_currToken == JsonToken.FIELD_NAME) { // not necessarily; possible but: return _nameCopied; } // other types, no benefit from accessing as char[] return false; } /** * Method called to release internal buffers owned by the base * reader. This may be called along with {@link #_closeInput} (for * example, when explicitly closing this reader instance), or * separately (if need be). */ @Override protected void _releaseBuffers2() { if (_bufferRecyclable) { byte[] buf = _inputBuffer; if (buf != null) { _inputBuffer = null; _ioContext.releaseReadIOBuffer(buf); } } } /* /********************************************************** /* JsonParser impl /********************************************************** */ @Override public JsonToken nextToken() throws IOException { _numTypesValid = NR_UNKNOWN; // For longer tokens (text, binary), we'll only read when requested if (_tokenIncomplete) { _skipIncomplete(); } _tokenOffsetForTotal = _inputPtr; // _tokenInputTotal = _currInputProcessed + _inputPtr; // also: clear any data retained so far _binaryValue = null; // Two main modes: values, and field names. if ((_currToken != JsonToken.FIELD_NAME) && _streamReadContext.inObject()) { return _updateToken(_handleFieldName()); } if (_inputPtr >= _inputEnd) { if (!_loadMore()) { return _eofAsNextToken(); } } final int ch = _inputBuffer[_inputPtr++] & 0xFF; _typeAsInt = ch; switch (ch >> 5) { case 0: // short shared string value reference if (ch != 0) { // 0x0 is invalid return _handleSharedString(ch-1); } break; case 1: // simple literals, numbers { int typeBits = ch & 0x1F; if (typeBits < 4) { switch (typeBits) { case 0x00: _textBuffer.resetWithEmpty(); return _updateToken(JsonToken.VALUE_STRING); case 0x01: return _updateToken(JsonToken.VALUE_NULL); case 0x02: // false return _updateToken(JsonToken.VALUE_FALSE); default: // 0x03 == true return _updateToken(JsonToken.VALUE_TRUE); } } if (typeBits == 4) { _finishInt(); return _updateToken(JsonToken.VALUE_NUMBER_INT); } // next 3 bytes define subtype if (typeBits <= 6) { // VInt (zigzag), BigInteger _tokenIncomplete = true; return _updateToken(JsonToken.VALUE_NUMBER_INT); } if (typeBits < 11 && typeBits != 7) { // floating-point _tokenIncomplete = true; return _updateToken(JsonToken.VALUE_NUMBER_FLOAT); } if (typeBits == 0x1A) { // == 0x3A == ':' -> possibly header signature for next chunk? if (handleSignature(false, false)) { // Ok, now; end-marker and header both imply doc boundary and a // 'null token'; but if both are seen, they are collapsed. // We can check this by looking at current token; if it's null, // need to get non-null token // 30-Mar-2021, tatu: [dataformats-binary#268] Let's verify we // handle repeated back-to-back headers separately if (_currToken == null) { return _nextAfterHeader(); } return _updateTokenToNull(); } _reportError("Unrecognized token byte 0x3A (malformed segment header?"); } } // and everything else is reserved, for now break; case 2: // tiny ASCII // fall through case 3: // short ASCII // fall through case 4: // tiny Unicode // fall through case 5: // short Unicode // No need to decode, unless we have to keep track of back-references (for shared string values) if (_seenStringValueCount >= 0) { // shared text values enabled return _addSeenStringValue(); } _tokenIncomplete = true; return _updateToken(JsonToken.VALUE_STRING); case 6: // small integers; zigzag encoded _numberInt = SmileUtil.zigzagDecode(ch & 0x1F); _numTypesValid = NR_INT; _numberType = NumberType.INT; return _updateToken(JsonToken.VALUE_NUMBER_INT); case 7: // binary/long-text/long-shared/start-end-markers switch (ch & 0x1F) { case 0x00: // long variable length ASCII case 0x04: // long variable length unicode _tokenIncomplete = true; return _updateToken(JsonToken.VALUE_STRING); case 0x08: // binary, 7-bit (0xE8) _tokenIncomplete = true; return _updateToken(JsonToken.VALUE_EMBEDDED_OBJECT); case 0x0C: // long shared string (0xEC) case 0x0D: case 0x0E: case 0x0F: if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } return _handleSharedString(((ch & 0x3) << 8) + (_inputBuffer[_inputPtr++] & 0xFF)); case 0x18: // START_ARRAY createChildArrayContext(-1, -1); return _updateToken(JsonToken.START_ARRAY); case 0x19: // END_ARRAY if (!_streamReadContext.inArray()) { _reportMismatchedEndMarker(']', '}'); } _streamReadContext = _streamReadContext.getParent(); return _updateToken(JsonToken.END_ARRAY); case 0x1A: // START_OBJECT createChildObjectContext(-1, -1); return _updateToken(JsonToken.START_OBJECT); case 0x1B: // not used in this mode; would be END_OBJECT _reportError("Invalid type marker byte 0xFB in value mode (would be END_OBJECT in key mode)"); case 0x1D: // binary, raw _tokenIncomplete = true; return _updateToken(JsonToken.VALUE_EMBEDDED_OBJECT); case 0x1F: // 0xFF, end of content return _updateTokenToNull(); } break; } // If we get this far, type byte is corrupt return _reportUnknownValueTypeToken(ch); } // should we change description based on reserved vs illegal? (special cases // of null bytes etc) private JsonToken _reportUnknownValueTypeToken(int ch) throws IOException { throw _constructReadException("Invalid type marker byte 0x%s for expected value token", Integer.toHexString(ch & 0xFF)); } // Helper method called in situations where Smile Header was encountered // and "current token" is `null`. This can occur both right after document-end // marker (normal situation) and immediately at the beginning of document // (repeated header markers). Normally we'll want to find the real next token // but will not want to do infinite recursion for abnormal case of a very long // sequence of repeated header markers. To guard against that, only call // recursively if we know next token cannot be header; checking that is simple // enough // // @since 2.12.3 private JsonToken _nextAfterHeader() throws IOException { if ((_inputPtr < _inputEnd) || _loadMore()) { if (_inputBuffer[_inputPtr] == SmileConstants.HEADER_BYTE_1) { // danger zone; just set and return null token return _updateTokenToNull(); } } // Otherwise safe enough to do recursion return nextToken(); } private final JsonToken _handleSharedString(int index) throws IOException { if (index >= _seenStringValueCount) { _reportInvalidSharedStringValue(index); } _textBuffer.resetWithString(_seenStringValues[index]); return _updateToken(JsonToken.VALUE_STRING); } private final JsonToken _addSeenStringValue() throws IOException { _finishToken(); String v = _textBuffer.contentsAsString(); if (_seenStringValueCount < _seenStringValues.length) { // !!! TODO: actually only store char[], first time around? _seenStringValues[_seenStringValueCount++] = v; } else { _expandSeenStringValues(v); } return _updateToken(JsonToken.VALUE_STRING); } private final void _expandSeenStringValues(String newText) { String[] oldShared = _seenStringValues; int len = oldShared.length; String[] newShared; if (len == 0) { newShared = new String[DEFAULT_STRING_VALUE_BUFFER_LENGTH]; } else if (len == SmileConstants.MAX_SHARED_STRING_VALUES) { // too many? Just flush... newShared = oldShared; _seenStringValueCount = 0; // could also clear, but let's not yet bother } else { int newSize = (len == DEFAULT_NAME_BUFFER_LENGTH) ? 256 : SmileConstants.MAX_SHARED_STRING_VALUES; newShared = Arrays.copyOf(oldShared, newSize); } _seenStringValues = newShared; _seenStringValues[_seenStringValueCount++] = newText; } /** * Method for forcing full read of current token, even if it might otherwise * only be read if data is accessed via {@link #getText} and similar methods. */ @Override public void finishToken() throws IOException { if (_tokenIncomplete) { _finishToken(); } } /* /********************************************************** /* Optimized accessors, isXxx, nextXxx (except for nextToken() /********************************************************** */ // Not (yet?) overridden, as of 2.6 /* public boolean hasTokenId(int id) { return super.hasTokenId(id); } */ //public boolean isExpectedStartArrayToken() { return currentToken() == JsonToken.START_ARRAY; } //public boolean isExpectedStartObjectToken() { return currentToken() == JsonToken.START_OBJECT; } @Override public boolean nextFieldName(SerializableString str) throws IOException { // Two parsing modes; can only succeed if expecting field name, so handle that first: if (_currToken != JsonToken.FIELD_NAME && _streamReadContext.inObject()) { // first, clear up state _numTypesValid = NR_UNKNOWN; if (_tokenIncomplete) { _skipIncomplete(); } _tokenOffsetForTotal = _inputPtr; _binaryValue = null; byte[] nameBytes = str.asQuotedUTF8(); final int byteLen = nameBytes.length; // need room for type byte, name bytes, possibly end marker, so: if ((_inputPtr + byteLen + 1) < _inputEnd) { // maybe... int ptr = _inputPtr; int ch = _inputBuffer[ptr++] & 0xFF; _typeAsInt = ch; main_switch: switch (ch >> 6) { case 0: // misc, including end marker switch (ch) { case 0x20: // empty String as name, legal if unusual _updateToken(JsonToken.FIELD_NAME); _inputPtr = ptr; _streamReadContext.setCurrentName(""); return (byteLen == 0); case 0x30: // long shared case 0x31: case 0x32: case 0x33: { int index = ((ch & 0x3) << 8) + (_inputBuffer[ptr++] & 0xFF); if (index >= _seenNameCount) { _reportInvalidSharedName(index); } final String name = _seenNames[index]; // lgtm [java/dereferenced-value-may-be-null] _streamReadContext.setCurrentName(name); _inputPtr = ptr; _updateToken(JsonToken.FIELD_NAME); return name.equals(str.getValue()); } //case 0x34: // long ASCII/Unicode name; let's not even try... } break; case 1: // short shared, can fully process { int index = (ch & 0x3F); if (index >= _seenNameCount) { _reportInvalidSharedName(index); } final String name = _seenNames[index]; // lgtm [java/dereferenced-value-may-be-null] _streamReadContext.setCurrentName(name); _inputPtr = ptr; _updateToken(JsonToken.FIELD_NAME); return name.equals(str.getValue()); } case 2: // short ASCII { int len = 1 + (ch & 0x3f); if (len == byteLen) { int i = 0; for (; i < len; ++i) { if (nameBytes[i] != _inputBuffer[ptr+i]) { break main_switch; } } // yes, does match... _inputPtr = ptr + len; final String name = str.getValue(); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } _streamReadContext.setCurrentName(name); _updateToken(JsonToken.FIELD_NAME); return true; } } break; case 3: // short Unicode // all valid, except for 0xFF { int len = (ch & 0x3F); if (len > 0x37) { if (len == 0x3B) { _updateToken(JsonToken.END_OBJECT); // 21-Mar-2021, tatu: We have `inObject()`, checked already _inputPtr = ptr; _streamReadContext = _streamReadContext.getParent(); return false; } // error, but let's not worry about that here break; } len += 2; // values from 2 to 57... if (len == byteLen) { int i = 0; for (; i < len; ++i) { if (nameBytes[i] != _inputBuffer[ptr+i]) { break main_switch; } } // yes, does match... _inputPtr = ptr + len; final String name = str.getValue(); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } _streamReadContext.setCurrentName(name); _updateToken(JsonToken.FIELD_NAME); return true; } } break; } } // wouldn't fit in buffer, just fall back to default processing } // otherwise just fall back to default handling; should occur rarely return (nextToken() == JsonToken.FIELD_NAME) && str.getValue().equals(currentName()); } @Override public String nextFieldName() throws IOException { // Two parsing modes; can only succeed if expecting field name, so handle that first: if (_currToken != JsonToken.FIELD_NAME && _streamReadContext.inObject()) { // first, clear up state _numTypesValid = NR_UNKNOWN; if (_tokenIncomplete) { _skipIncomplete(); } _tokenOffsetForTotal = _inputPtr; _binaryValue = null; if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } int ch = _inputBuffer[_inputPtr++] & 0xFF; // is this needed? _typeAsInt = ch; switch (ch >> 6) { case 0: // misc, including end marker switch (ch) { case 0x20: // empty String as name, legal if unusual _streamReadContext.setCurrentName(""); _updateToken(JsonToken.FIELD_NAME); return ""; case 0x30: // long shared case 0x31: case 0x32: case 0x33: if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } { int index = ((ch & 0x3) << 8) + (_inputBuffer[_inputPtr++] & 0xFF); if (index >= _seenNameCount) { _reportInvalidSharedName(index); } final String name = _seenNames[index]; // lgtm [java/dereferenced-value-may-be-null] _streamReadContext.setCurrentName(name); _updateToken(JsonToken.FIELD_NAME); return name; } case 0x34: // long ASCII/Unicode name _updateToken(JsonToken.FIELD_NAME); return _handleLongFieldName(); } break; case 1: // short shared, can fully process { int index = (ch & 0x3F); if (index >= _seenNameCount) { _reportInvalidSharedName(index); } final String name = _seenNames[index]; // lgtm [java/dereferenced-value-may-be-null] _streamReadContext.setCurrentName(name); _updateToken(JsonToken.FIELD_NAME); return name; } case 2: // short ASCII { final int len = 1 + (ch & 0x3f); final String name = _findOrDecodeShortAsciiName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } _streamReadContext.setCurrentName(name); _updateToken(JsonToken.FIELD_NAME); return name; } case 3: // short Unicode // all valid, except for 0xFF ch &= 0x3F; { if (ch > 0x37) { if (ch == 0x3B) { // 21-Mar-2021, tatu: We have `inObject()`, checked already _streamReadContext = _streamReadContext.getParent(); _updateToken(JsonToken.END_OBJECT); return null; } } else { final int len = ch + 2; // values from 2 to 57... final String name = _findOrDecodeShortUnicodeName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } _streamReadContext.setCurrentName(name); _updateToken(JsonToken.FIELD_NAME); return name; } } break; } // Other byte values are illegal return _reportUnknownNameToken(_typeAsInt); } // otherwise just fall back to default handling; should occur rarely return (nextToken() == JsonToken.FIELD_NAME) ? currentName() : null; } // Should we try to give more information on kind of problem (reserved vs // illegal by definition; suggesting likely problem) private String _reportUnknownNameToken(int ch) throws IOException { throw _constructReadException("Invalid type marker byte 0x%s for expected field name (or END_OBJECT marker)", Integer.toHexString(ch & 0xFF)); } @Override public String nextTextValue() throws IOException { // can't get text value if expecting name, so if (!_streamReadContext.inObject() || _currToken == JsonToken.FIELD_NAME) { if (_tokenIncomplete) { _skipIncomplete(); } int ptr = _inputPtr; if (ptr >= _inputEnd) { if (!_loadMore()) { _eofAsNextToken(); return null; } ptr = _inputPtr; } _tokenOffsetForTotal = ptr; // _tokenInputTotal = _currInputProcessed + _inputPtr; int ch = _inputBuffer[ptr++] & 0xFF; _typeAsInt = ch; // also: clear any data retained so far _binaryValue = null; switch (ch >> 5) { case 0: // short shared string value reference if (ch != 0) { // _handleSharedString... --ch; if (ch >= _seenStringValueCount) { _reportInvalidSharedStringValue(ch); } _inputPtr = ptr; String text = _seenStringValues[ch]; _textBuffer.resetWithString(text); _updateToken(JsonToken.VALUE_STRING); return text; } else { // important: this is invalid, don't accept _reportError("Invalid token byte 0x00"); } case 1: // simple literals, numbers { int typeBits = ch & 0x1F; if (typeBits == 0x00) { _inputPtr = ptr; _textBuffer.resetWithEmpty(); _updateToken(JsonToken.VALUE_STRING); return ""; } } break; case 2: // tiny ASCII // fall through case 3: // short ASCII _updateToken(JsonToken.VALUE_STRING); _inputPtr = ptr; { final String text = _decodeShortAsciiValue(1 + (ch & 0x3F)); if (_seenStringValueCount >= 0) { // shared text values enabled if (_seenStringValueCount < _seenStringValues.length) { _seenStringValues[_seenStringValueCount++] = text; } else { _expandSeenStringValues(text); } } return text; } case 4: // tiny Unicode // fall through case 5: // short Unicode _updateToken(JsonToken.VALUE_STRING); _inputPtr = ptr; { final String text = _decodeShortUnicodeValue(2 + (ch & 0x3F)); if (_seenStringValueCount >= 0) { // shared text values enabled if (_seenStringValueCount < _seenStringValues.length) { _seenStringValues[_seenStringValueCount++] = text; } else { _expandSeenStringValues(text); } } return text; } case 6: // small integers; zigzag encoded break; case 7: // binary/long-text/long-shared/start-end-markers // TODO: support longer strings too? /* switch (ch & 0x1F) { case 0x00: // long variable length ASCII case 0x04: // long variable length unicode _tokenIncomplete = true; return _updateToken(JsonToken.VALUE_STRING); case 0x08: // binary, 7-bit break main; case 0x0C: // long shared string case 0x0D: case 0x0E: case 0x0F: if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } return _handleSharedString(((ch & 0x3) << 8) + (_inputBuffer[_inputPtr++] & 0xFF)); } break; */ break; } } // otherwise fall back to generic handling (note: we do NOT assign 'ptr') return (nextToken() == JsonToken.VALUE_STRING) ? getText() : null; } @Override public int nextIntValue(int defaultValue) throws IOException { if (nextToken() == JsonToken.VALUE_NUMBER_INT) { return getIntValue(); } return defaultValue; } @Override public long nextLongValue(long defaultValue) throws IOException { if (nextToken() == JsonToken.VALUE_NUMBER_INT) { return getLongValue(); } return defaultValue; } @Override public Boolean nextBooleanValue() throws IOException { JsonToken t = nextToken(); if (t == JsonToken.VALUE_TRUE) { return Boolean.TRUE; } if (t == JsonToken.VALUE_FALSE) { return Boolean.FALSE; } return null; } /* /********************************************************** /* Public API, access to token information, text /********************************************************** */ /** * Method for accessing textual representation of the current event; * if no current event (before first call to {@link #nextToken}, or * after encountering end-of-input), returns null. * Method can be called for any event. */ @Override public String getText() throws IOException { if (_tokenIncomplete) { _tokenIncomplete = false; // Let's inline part of "_finishToken", common case int tb = _typeAsInt; int type = (tb >> 5); if (type == 2 || type == 3) { // tiny & short ASCII return _decodeShortAsciiValue(1 + (tb & 0x3F)); } if (type == 4 || type == 5) { // tiny & short Unicode // short unicode; note, lengths 2 - 65 (off-by-one compared to ASCII) return _decodeShortUnicodeValue(2 + (tb & 0x3F)); } _finishToken(); } if (_currToken == JsonToken.VALUE_STRING) { return _textBuffer.contentsAsString(); } JsonToken t = _currToken; if (t == null) { // null only before/after document return null; } if (t == JsonToken.FIELD_NAME) { return _streamReadContext.getCurrentName(); } if (t.isNumeric()) { // TODO: optimize? return getNumberValue().toString(); } return _currToken.asString(); } @Override public char[] getTextCharacters() throws IOException { if (_currToken != null) { // null only before/after document if (_tokenIncomplete) { _finishToken(); } if (_currToken == JsonToken.VALUE_STRING) { return _textBuffer.getTextBuffer(); } if (_currToken == JsonToken.FIELD_NAME) { if (!_nameCopied) { String name = _streamReadContext.getCurrentName(); int nameLen = name.length(); if (_nameCopyBuffer == null) { _nameCopyBuffer = _ioContext.allocNameCopyBuffer(nameLen); } else if (_nameCopyBuffer.length < nameLen) { _nameCopyBuffer = new char[nameLen]; } name.getChars(0, nameLen, _nameCopyBuffer, 0); _nameCopied = true; } return _nameCopyBuffer; } if (_currToken.isNumeric()) { // TODO: optimize? return getNumberValue().toString().toCharArray(); } return _currToken.asCharArray(); } return null; } @Override public int getTextLength() throws IOException { if (_currToken != null) { // null only before/after document if (_tokenIncomplete) { _finishToken(); } if (_currToken == JsonToken.VALUE_STRING) { return _textBuffer.size(); } if (_currToken == JsonToken.FIELD_NAME) { return _streamReadContext.getCurrentName().length(); } if ((_currToken == JsonToken.VALUE_NUMBER_INT) || (_currToken == JsonToken.VALUE_NUMBER_FLOAT)) { // TODO: optimize return getNumberValue().toString().length(); } final char[] ch = _currToken.asCharArray(); if (ch != null) { return ch.length; } } return 0; } @Override public int getTextOffset() throws IOException { return 0; } @Override public String getValueAsString() throws IOException { // inlined 'getText()' for common case of having String if (_tokenIncomplete) { _tokenIncomplete = false; int tb = _typeAsInt; int type = (tb >> 5); if (type == 2 || type == 3) { // tiny & short ASCII return _decodeShortAsciiValue(1 + (tb & 0x3F)); } if (type == 4 || type == 5) { // tiny & short Unicode return _decodeShortUnicodeValue(2 + (tb & 0x3F)); } _finishToken(); } if (_currToken == JsonToken.VALUE_STRING) { return _textBuffer.contentsAsString(); } if (_currToken == null || _currToken == JsonToken.VALUE_NULL || !_currToken.isScalarValue()) { return null; } return getText(); } @Override public String getValueAsString(String defaultValue) throws IOException { if (_currToken != JsonToken.VALUE_STRING) { if (_currToken == null || _currToken == JsonToken.VALUE_NULL || !_currToken.isScalarValue()) { return defaultValue; } } return getText(); } @Override // since 2.8 public int getText(Writer writer) throws IOException { if (_tokenIncomplete) { _finishToken(); } JsonToken t = _currToken; if (t == JsonToken.VALUE_STRING) { return _textBuffer.contentsToWriter(writer); } if (t == JsonToken.FIELD_NAME) { String n = _streamReadContext.getCurrentName(); writer.write(n); return n.length(); } if (t != null) { if (t.isNumeric()) { return _textBuffer.contentsToWriter(writer); } char[] ch = t.asCharArray(); writer.write(ch); return ch.length; } return 0; } /* /********************************************************** /* Public API, access to token information, binary /********************************************************** */ @Override public byte[] getBinaryValue(Base64Variant b64variant) throws IOException { if (_currToken == JsonToken.VALUE_EMBEDDED_OBJECT ) { if (_tokenIncomplete) { _finishToken(); } } else if (_currToken == JsonToken.VALUE_STRING) { return _getBinaryFromString(b64variant); } else { throw _constructReadException( "Current token (%s) not VALUE_EMBEDDED_OBJECT or VALUE_STRING, can not access as binary", currentToken()); } return _binaryValue; } @Override public Object getEmbeddedObject() throws IOException { if (_tokenIncomplete) { _finishToken(); } if (_currToken == JsonToken.VALUE_EMBEDDED_OBJECT) { return _binaryValue; } return null; } @Override public int readBinaryValue(Base64Variant b64variant, OutputStream out) throws IOException { if (_currToken != JsonToken.VALUE_EMBEDDED_OBJECT) { if (_currToken == JsonToken.VALUE_STRING) { // 26-Jun-2021, tatu: Not optimized; could make streaming if we // really want in future final byte[] b = _getBinaryFromString(b64variant); final int len = b.length; out.write(b, 0, len); return len; } throw _constructReadException( "Current token (%s) not VALUE_EMBEDDED_OBJECT or VALUE_STRING, can not access as binary", currentToken()); } // Ok, first, unlikely (but legal?) case where someone already requested binary data: if (!_tokenIncomplete) { if (_binaryValue == null) { // most likely already read... return 0; } final int len = _binaryValue.length; out.write(_binaryValue, 0, len); return len; } // otherwise, handle, mark as complete // first, raw inlined binary data (simple) if (_typeAsInt == SmileConstants.INT_MISC_BINARY_RAW) { final int totalCount = _readUnsignedVInt(); int left = totalCount; while (left > 0) { int avail = _inputEnd - _inputPtr; if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); avail = _inputEnd - _inputPtr; } int count = Math.min(avail, left); out.write(_inputBuffer, _inputPtr, count); _inputPtr += count; left -= count; } _tokenIncomplete = false; return totalCount; } if (_typeAsInt != SmileConstants.INT_MISC_BINARY_7BIT) { _throwInternal(); } // or, alternative, 7-bit encoded stuff: final int totalCount = _readUnsignedVInt(); byte[] encodingBuffer = _ioContext.allocBase64Buffer(); try { _readBinaryEncoded(out, totalCount, encodingBuffer); } finally { _ioContext.releaseBase64Buffer(encodingBuffer); } _tokenIncomplete = false; return totalCount; } private void _readBinaryEncoded(OutputStream out, int length, byte[] buffer) throws IOException { int outPtr = 0; final int lastSafeOut = buffer.length - 7; // first handle all full 7/8 units while (length > 7) { if ((_inputEnd - _inputPtr) < 8) { _loadToHaveAtLeast(8); } int i1 = (_inputBuffer[_inputPtr++] << 25) + (_inputBuffer[_inputPtr++] << 18) + (_inputBuffer[_inputPtr++] << 11) + (_inputBuffer[_inputPtr++] << 4); int x = _inputBuffer[_inputPtr++]; i1 += x >> 3; int i2 = ((x & 0x7) << 21) + (_inputBuffer[_inputPtr++] << 14) + (_inputBuffer[_inputPtr++] << 7) + _inputBuffer[_inputPtr++]; // Ok: got our 7 bytes, just need to split, copy buffer[outPtr++] = (byte)(i1 >> 24); buffer[outPtr++] = (byte)(i1 >> 16); buffer[outPtr++] = (byte)(i1 >> 8); buffer[outPtr++] = (byte)i1; buffer[outPtr++] = (byte)(i2 >> 16); buffer[outPtr++] = (byte)(i2 >> 8); buffer[outPtr++] = (byte)i2; length -= 7; // ensure there's always room for at least 7 bytes more after looping: if (outPtr > lastSafeOut) { out.write(buffer, 0, outPtr); outPtr = 0; } } // and then leftovers: n+1 bytes to decode n bytes if (length > 0) { if ((_inputEnd - _inputPtr) < (length+1)) { _loadToHaveAtLeast(length+1); } int value = _inputBuffer[_inputPtr++]; for (int i = 1; i < length; ++i) { value = (value << 7) + _inputBuffer[_inputPtr++]; buffer[outPtr++] = (byte) (value >> (7 - i)); } // last byte is different, has remaining 1 - 6 bits, right-aligned value <<= length; buffer[outPtr++] = (byte) (value + _inputBuffer[_inputPtr++]); } if (outPtr > 0) { out.write(buffer, 0, outPtr); } } // @since 2.13 private final byte[] _getBinaryFromString(Base64Variant variant) throws IOException { if (_tokenIncomplete) { _finishToken(); } if (_binaryValue == null) { // 26-Jun-2021, tatu: Copied from ParserBase (except no recycling of BAB here) ByteArrayBuilder builder = new ByteArrayBuilder(); _decodeBase64(getText(), builder, variant); _binaryValue = builder.toByteArray(); } return _binaryValue; } /* /********************************************************** /* Internal methods, field name parsing /********************************************************** */ /** * Method that handles initial token type recognition for token * that has to be either FIELD_NAME or END_OBJECT. */ protected final JsonToken _handleFieldName() throws IOException { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } int ch = _inputBuffer[_inputPtr++] & 0xFF; // is this needed? _typeAsInt = ch; switch (ch >> 6) { case 0: // misc, including end marker switch (ch) { case 0x20: // empty String as name, legal if unusual _streamReadContext.setCurrentName(""); return JsonToken.FIELD_NAME; case 0x30: // long shared case 0x31: case 0x32: case 0x33: if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } { int index = ((ch & 0x3) << 8) + (_inputBuffer[_inputPtr++] & 0xFF); if (index >= _seenNameCount) { _reportInvalidSharedName(index); } _streamReadContext.setCurrentName(_seenNames[index]); // lgtm [java/dereferenced-value-may-be-null] } return JsonToken.FIELD_NAME; case 0x34: // long ASCII/Unicode name _handleLongFieldName(); return JsonToken.FIELD_NAME; } break; case 1: // short shared, can fully process { int index = (ch & 0x3F); if (index >= _seenNameCount) { _reportInvalidSharedName(index); } _streamReadContext.setCurrentName(_seenNames[index]); // lgtm [java/dereferenced-value-may-be-null] } return JsonToken.FIELD_NAME; case 2: // short ASCII { final int len = 1 + (ch & 0x3f); final String name = _findOrDecodeShortAsciiName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } _streamReadContext.setCurrentName(name); } return JsonToken.FIELD_NAME; case 3: // short Unicode // all valid, except for 0xFF ch &= 0x3F; { if (ch > 0x37) { if (ch == 0x3B) { if (!_streamReadContext.inObject()) { _reportMismatchedEndMarker('}', ']'); } _streamReadContext = _streamReadContext.getParent(); return JsonToken.END_OBJECT; } } else { final int len = ch + 2; // values from 2 to 57... final String name = _findOrDecodeShortUnicodeName(len); if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } _streamReadContext.setCurrentName(name); return JsonToken.FIELD_NAME; } } break; } // Other byte values are illegal throw _constructReadException("Invalid type marker byte 0x%s for expected field name (or END_OBJECT marker)", Integer.toHexString(_typeAsInt)); } private String _findOrDecodeShortAsciiName(final int len) throws IOException { // First things first: must ensure all in buffer if ((_inputEnd - _inputPtr) < len) { _loadToHaveAtLeast(len); } if (_symbolsCanonical) { String name = _findDecodedFromSymbols(len); if (name != null) { _inputPtr += len; } else { name = _decodeShortAsciiName(len); name = _addDecodedToSymbols(len, name); } return name; } // if not canonicalizing, much simpler: return _decodeShortAsciiName(len); } private String _findOrDecodeShortUnicodeName(final int len) throws IOException { // First things first: must ensure all in buffer if ((_inputEnd - _inputPtr) < len) { _loadToHaveAtLeast(len); } if (_symbolsCanonical) { String name = _findDecodedFromSymbols(len); if (name != null) { _inputPtr += len; } else { name = _decodeShortUnicodeName(len); name = _addDecodedToSymbols(len, name); } return name; } // if not canonicalizing, much simpler: return _decodeShortUnicodeName(len); } /** * Method called to try to expand shared name area to fit one more potentially * shared String. If area is already at its biggest size, will just clear * the area (by setting next-offset to 0) */ private final String[] _expandSeenNames(String[] oldShared) { int len = oldShared.length; String[] newShared; if (len == 0) { newShared = new String[DEFAULT_NAME_BUFFER_LENGTH]; } else if (len == SmileConstants.MAX_SHARED_NAMES) { // too many? Just flush... newShared = oldShared; _seenNameCount = 0; // could also clear, but let's not yet bother } else { int newSize = (len == DEFAULT_STRING_VALUE_BUFFER_LENGTH) ? 256 : SmileConstants.MAX_SHARED_NAMES; newShared = Arrays.copyOf(oldShared, newSize); } return newShared; } private final String _addDecodedToSymbols(int len, String name) throws IOException { if (len < 5) { return _symbols.addName(name, _quad1); } if (len < 9) { return _symbols.addName(name, _quad1, _quad2); } if (len < 13) { return _symbols.addName(name, _quad1, _quad2, _quad3); } int qlen = (len + 3) >> 2; return _symbols.addName(name, _quadBuffer, qlen); } private final String _decodeShortAsciiName(int len) throws IOException { // note: caller ensures we have enough bytes available // also note that since it's a short name (64 bytes), segment WILL have enough space if (JDK11_OR_LATER) { // On newer JDKs the String internals changed and for ASCII strings the constructor // that takes a byte array can be used and internally is just Arrays.copyOfRange. final int inPtr = _inputPtr; _inputPtr = inPtr + len; String str = new String(_inputBuffer, inPtr, len, StandardCharsets.US_ASCII); _textBuffer.resetWithString(str); return str; } else { char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); int outPtr = 0; final byte[] inBuf = _inputBuffer; int inPtr = _inputPtr; for (int inEnd = inPtr + len; inPtr < inEnd; ++inPtr) { outBuf[outPtr++] = (char) inBuf[inPtr]; } _inputPtr = inPtr; return _textBuffer.setCurrentAndReturn(len); } } /** * Helper method used to decode short Unicode string, length for which actual * length (in bytes) is known * * @param len Length between 1 and 64 */ private final String _decodeShortUnicodeName(int len) throws IOException { // note: caller ensures we have enough bytes available int outPtr = 0; char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); int inPtr = _inputPtr; final int[] codes = SmileConstants.sUtf8UnitLengths; final byte[] inBuf = _inputBuffer; for (final int end = inPtr + len; inPtr < end; ) { int i = inBuf[inPtr++] & 0xFF; int code = codes[i]; if (code != 0) { // 08-Jul-2021, tatu: As per [dataformats-binary#] need to // be careful wrt end-of-buffer truncated codepoints if ((inPtr + code) > end) { final int firstCharOffset = len - (end - inPtr) - 1; _reportTruncatedUTF8InName(len, firstCharOffset, i, code); } // trickiest one, need surrogate handling switch (code) { case 1: i = ((i & 0x1F) << 6) | (inBuf[inPtr++] & 0x3F); break; case 2: i = ((i & 0x0F) << 12) | ((inBuf[inPtr++] & 0x3F) << 6) | (inBuf[inPtr++] & 0x3F); break; case 3: i = ((i & 0x07) << 18) | ((inBuf[inPtr++] & 0x3F) << 12) | ((inBuf[inPtr++] & 0x3F) << 6) | (inBuf[inPtr++] & 0x3F); // note: this is the codepoint value; need to split, too i -= 0x10000; outBuf[outPtr++] = (char) (0xD800 | (i >> 10)); i = 0xDC00 | (i & 0x3FF); break; default: // invalid // Update pointer here to point to (more) correct location _inputPtr = inPtr; throw _constructReadException( "Invalid byte 0x%02X in short Unicode text block", i); } } outBuf[outPtr++] = (char) i; } // let's only update offset here, so error message accurate _inputPtr += len; return _textBuffer.setCurrentAndReturn(outPtr); } // note: slightly edited copy of UTF8StreamParser.addName() private final String _decodeLongUnicodeName(int[] quads, int byteLen, int quadLen, boolean addToSymbolTable) throws IOException { int lastQuadBytes = byteLen & 3; // Ok: must decode UTF-8 chars. No other validation SHOULD be needed (except bounds checks?) // Note: last quad is not correctly aligned (leading zero bytes instead // need to shift a bit, instead of trailing). Only need to shift it // for UTF-8 decoding; need revert for storage (since key will not // be aligned, to optimize lookup speed) // int lastQuad; if (lastQuadBytes > 0) { lastQuad = quads[quadLen-1]; // 8/16/24 bit left shift quads[quadLen-1] = (lastQuad << ((4 - lastQuadBytes) << 3)); } else { lastQuad = 0; } char[] cbuf = _textBuffer.emptyAndGetCurrentSegment(); int cix = 0; for (int ix = 0; ix < byteLen; ) { int ch = quads[ix >> 2]; // current quad, need to shift+mask int byteIx = (ix & 3); ch = (ch >> ((3 - byteIx) << 3)) & 0xFF; ++ix; if (ch > 127) { // multi-byte int needed; if ((ch & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) ch &= 0x1F; needed = 1; } else if ((ch & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) ch &= 0x0F; needed = 2; } else if ((ch & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... ch &= 0x07; needed = 3; } else { // 5- and 6-byte chars not valid chars _reportInvalidInitial(ch); needed = ch = 1; // never really gets this far } if ((ix + needed) > byteLen) { _reportInvalidEOF(" in long field name", JsonToken.FIELD_NAME); } // Ok, always need at least one more: int ch2 = quads[ix >> 2]; // current quad, need to shift+mask byteIx = (ix & 3); ch2 = (ch2 >> ((3 - byteIx) << 3)); ++ix; if ((ch2 & 0xC0) != 0x080) { _reportInvalidOther(ch2); } ch = (ch << 6) | (ch2 & 0x3F); if (needed > 1) { ch2 = quads[ix >> 2]; byteIx = (ix & 3); ch2 = (ch2 >> ((3 - byteIx) << 3)); ++ix; if ((ch2 & 0xC0) != 0x080) { _reportInvalidOther(ch2); } ch = (ch << 6) | (ch2 & 0x3F); if (needed > 2) { // 4 bytes? (need surrogates on output) ch2 = quads[ix >> 2]; byteIx = (ix & 3); ch2 = (ch2 >> ((3 - byteIx) << 3)); ++ix; if ((ch2 & 0xC0) != 0x080) { _reportInvalidOther(ch2 & 0xFF); } ch = (ch << 6) | (ch2 & 0x3F); } } if (needed > 2) { // surrogate pair? once again, let's output one here, one later on ch -= 0x10000; // to normalize it starting with 0x0 if (cix >= cbuf.length) { cbuf = _textBuffer.expandCurrentSegment(); } cbuf[cix++] = (char) (0xD800 + (ch >> 10)); ch = 0xDC00 | (ch & 0x03FF); } } if (cix >= cbuf.length) { cbuf = _textBuffer.expandCurrentSegment(); } cbuf[cix++] = (char) ch; } // Ok. Now we have the character array, and can construct the String String baseName = new String(cbuf, 0, cix); // And finally, un-align if necessary if (lastQuadBytes > 0) { quads[quadLen-1] = lastQuad; } if (addToSymbolTable) { return _symbols.addName(baseName, quads, quadLen); } return baseName; } private final String _handleLongFieldName() throws IOException { // First: gather quads we need, looking for end marker final byte[] inBuf = _inputBuffer; int quads = 0; int bytes = 0; int q = 0; while (true) { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } byte b = inBuf[_inputPtr++]; if (BYTE_MARKER_END_OF_STRING == b) { bytes = 0; break; } q = ((int) b) & 0xFF; if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } b = inBuf[_inputPtr++]; if (BYTE_MARKER_END_OF_STRING == b) { bytes = 1; break; } q = (q << 8) | (b & 0xFF); if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } b = inBuf[_inputPtr++]; if (BYTE_MARKER_END_OF_STRING == b) { bytes = 2; break; } q = (q << 8) | (b & 0xFF); if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } b = inBuf[_inputPtr++]; if (BYTE_MARKER_END_OF_STRING == b) { bytes = 3; break; } q = (q << 8) | (b & 0xFF); if (quads >= _quadBuffer.length) { _quadBuffer = _growArrayTo(_quadBuffer, _quadBuffer.length + 256); // grow by 1k } _quadBuffer[quads++] = q; } // and if we have more bytes, append those too int byteLen = (quads << 2); if (bytes > 0) { if (quads >= _quadBuffer.length) { _quadBuffer = _growArrayTo(_quadBuffer, _quadBuffer.length + 256); } q = _padLastQuad(q, bytes); _quadBuffer[quads++] = q; byteLen += bytes; } // Know this name already? String name = _symbolsCanonical ? _symbols.findName(_quadBuffer, quads) : null; if (name == null) { name = _decodeLongUnicodeName(_quadBuffer, byteLen, quads, _symbolsCanonical); } if (_seenNames != null) { if (_seenNameCount >= _seenNames.length) { _seenNames = _expandSeenNames(_seenNames); } _seenNames[_seenNameCount++] = name; } _streamReadContext.setCurrentName(name); return name; } /** * Helper method for trying to find specified encoded UTF-8 byte sequence * from symbol table; if successful avoids actual decoding to String. *

* NOTE: caller MUST ensure input buffer has enough content. */ private final String _findDecodedFromSymbols(final int len) throws IOException { // First: maybe we already have this name decoded? if (len < 5) { int inPtr = _inputPtr; final byte[] inBuf = _inputBuffer; int q = _padQuadForNulls(inBuf[inPtr]); if (len > 1) { q = (q << 8) + (inBuf[++inPtr] & 0xFF); if (len > 2) { q = (q << 8) + (inBuf[++inPtr] & 0xFF); if (len > 3) { q = (q << 8) + (inBuf[++inPtr] & 0xFF); } } } _quad1 = q; return _symbols.findName(q); } final byte[] inBuf = _inputBuffer; int inPtr = _inputPtr; // First quadbyte is easy int q1 = (inBuf[inPtr++] & 0xFF); q1 = (q1 << 8) | (inBuf[inPtr++] & 0xFF); q1 = (q1 << 8) | (inBuf[inPtr++] & 0xFF); q1 = (q1 << 8) | (inBuf[inPtr++] & 0xFF); if (len < 9) { int q2 = _padQuadForNulls(inBuf[inPtr++]); int left = len - 5; if (left > 0) { q2 = (q2 << 8) + (inBuf[inPtr++] & 0xFF); if (left > 1) { q2 = (q2 << 8) + (inBuf[inPtr++] & 0xFF); if (left > 2) { q2 = (q2 << 8) + (inBuf[inPtr++] & 0xFF); } } } _quad1 = q1; _quad2 = q2; return _symbols.findName(q1, q2); } int q2 = (inBuf[inPtr++] & 0xFF); q2 = (q2 << 8) | (inBuf[inPtr++] & 0xFF); q2 = (q2 << 8) | (inBuf[inPtr++] & 0xFF); q2 = (q2 << 8) | (inBuf[inPtr++] & 0xFF); if (len < 13) { int q3 = _padQuadForNulls(inBuf[inPtr++]); int left = len - 9; if (left > 0) { q3 = (q3 << 8) + (inBuf[inPtr++] & 0xFF); if (left > 1) { q3 = (q3 << 8) + (inBuf[inPtr++] & 0xFF); if (left > 2) { q3 = (q3 << 8) + (inBuf[inPtr++] & 0xFF); } } } _quad1 = q1; _quad2 = q2; _quad3 = q3; return _symbols.findName(q1, q2, q3); } return _findDecodedFixed12(len, q1, q2); } /** * Method for locating names longer than 12 bytes (in UTF-8) */ private final String _findDecodedFixed12(int len, int q1, int q2) throws IOException { // first, need enough buffer to store bytes as ints: { int bufLen = (len + 3) >> 2; if (bufLen > _quadBuffer.length) { _quadBuffer = _growArrayTo(_quadBuffer, bufLen); } } _quadBuffer[0] = q1; _quadBuffer[1] = q2; // then decode, full quads first int offset = 2; int inPtr = _inputPtr+8; len -= 8; final byte[] inBuf = _inputBuffer; do { int q = (inBuf[inPtr++] & 0xFF); q = (q << 8) | inBuf[inPtr++] & 0xFF; q = (q << 8) | inBuf[inPtr++] & 0xFF; q = (q << 8) | inBuf[inPtr++] & 0xFF; _quadBuffer[offset++] = q; } while ((len -= 4) > 3); // and then leftovers if (len > 0) { int q = _padQuadForNulls(inBuf[inPtr]); if (len > 1) { q = (q << 8) + (inBuf[++inPtr] & 0xFF); if (len > 2) { q = (q << 8) + (inBuf[++inPtr] & 0xFF); } } _quadBuffer[offset++] = q; } return _symbols.findName(_quadBuffer, offset); } private static int[] _growArrayTo(int[] arr, int minSize) { final int size = minSize+4; if (arr == null) { return new int[size]; } return Arrays.copyOf(arr, size); } // Helper methods needed to fix [dataformats-binary#312], masking of 0x00 character private final static int _padLastQuad(int q, int bytes) { return (bytes == 4) ? q : (q | (-1 << (bytes << 3))); } private final static int _padQuadForNulls(int firstByte) { return (firstByte & 0xFF) | 0xFFFFFF00; } /* /********************************************************** /* Internal methods, secondary parsing /********************************************************** */ @Override protected void _parseNumericValue() throws IOException { if (!_tokenIncomplete) { _reportError("Internal error: number token (%s) decoded, no value set", _currToken); } _tokenIncomplete = false; int tb = _typeAsInt; // ensure we got a numeric type with value that is lazily parsed if ((tb >> 5) != 1) { _reportError("Current token (%s) not numeric, can not use numeric value accessors", _currToken); } _finishNumberToken(tb); } /* @Override // since 2.6 protected int _parseIntValue() throws IOException { // Inlined variant of: _parseNumericValue(NR_INT) if (_tokenIncomplete) { _tokenIncomplete = false; if ((_typeAsInt & 0x1F) == 4) { _finishInt(); // vint return _numberInt; } _finishNumberToken(_typeAsInt); } if ((_numTypesValid & NR_INT) == 0) { convertNumberToInt(); } return _numberInt; } */ /** * Method called to finish parsing of a token so that token contents * are retrievable */ protected final void _finishToken() throws IOException { _tokenIncomplete = false; int tb = _typeAsInt; int type = (tb >> 5); if (type == 1) { // simple literals, numbers _finishNumberToken(tb); return; } if (type <= 3) { // tiny & short ASCII _decodeShortAsciiValue(1 + (tb & 0x3F)); return; } if (type <= 5) { // tiny & short Unicode // short unicode; note, lengths 2 - 65 (off-by-one compared to ASCII) _decodeShortUnicodeValue(2 + (tb & 0x3F)); return; } if (type == 7) { tb &= 0x1F; // next 3 bytes define subtype switch (tb >> 2) { case 0: // long variable length ASCII _decodeLongAsciiValue(); return; case 1: // long variable length Unicode _decodeLongUnicodeValue(); return; case 2: // binary, 7-bit _binaryValue = _read7BitBinaryWithLength(); return; case 7: // binary, raw _binaryValue = _finishBinaryRaw(); return; } } // sanity check _throwInternal(); } protected final void _finishNumberToken(int tb) throws IOException { switch (tb & 0x1F) { case 4: _finishInt(); // vint return; case 5: // vlong _finishLong(); return; case 6: _finishBigInteger(); return; case 8: // float _finishFloat(); return; case 9: // double _finishDouble(); return; case 10: // big-decimal _finishBigDecimal(); return; } _throwInternal(); } /* /********************************************************** /* Internal methods, secondary Number parsing /********************************************************** */ private final void _finishInt() throws IOException { _numTypesValid = NR_INT; _numberType = NumberType.INT; int ptr = _inputPtr; if ((ptr + 5) >= _inputEnd) { _finishIntSlow(); return; } int value = _inputBuffer[ptr++]; int i; if (value < 0) { // 6 bits value &= 0x3F; } else { i = _inputBuffer[ptr++]; if (i >= 0) { // 13 bits value = (value << 7) + i; i = _inputBuffer[ptr++]; if (i >= 0) { value = (value << 7) + i; i = _inputBuffer[ptr++]; if (i >= 0) { value = (value << 7) + i; // and then we must get negative i = _inputBuffer[ptr++]; if (i >= 0) { _reportError("Corrupt input; 32-bit VInt extends beyond 5 data bytes"); } } } } value = (value << 6) + (i & 0x3F); } _inputPtr = ptr; _numberInt = SmileUtil.zigzagDecode(value); } private final void _finishIntSlow() throws IOException { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } int value = _inputBuffer[_inputPtr++]; int i; if (value < 0) { // 6 bits value &= 0x3F; } else { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } i = _inputBuffer[_inputPtr++]; if (i >= 0) { // 13 bits value = (value << 7) + i; if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } i = _inputBuffer[_inputPtr++]; if (i >= 0) { value = (value << 7) + i; if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } i = _inputBuffer[_inputPtr++]; if (i >= 0) { value = (value << 7) + i; // and then we must get negative if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } i = _inputBuffer[_inputPtr++]; if (i >= 0) { _reportError("Corrupt input; 32-bit VInt extends beyond 5 data bytes"); } } } } value = (value << 6) + (i & 0x3F); } _numberInt = SmileUtil.zigzagDecode(value); } private final void _finishLong() throws IOException { _numTypesValid = NR_LONG; _numberType = NumberType.LONG; int ptr = _inputPtr; final int maxEnd = ptr+11; if (maxEnd >= _inputEnd) { _finishLongSlow(); return; } int i = _inputBuffer[ptr++]; // first 7 bits i = (i << 7) + _inputBuffer[ptr++]; // 14 bits i = (i << 7) + _inputBuffer[ptr++]; // 21 i = (i << 7) + _inputBuffer[ptr++]; // Ok: couple of bytes more long l = i; do { int value = _inputBuffer[ptr++]; if (value < 0) { l = (l << 6) + (value & 0x3F); _inputPtr = ptr; _numberLong = SmileUtil.zigzagDecode(l); return; } l = (l << 7) + value; } while (ptr < maxEnd); _reportError("Corrupt input; 64-bit VInt extends beyond 11 data bytes"); } private final void _finishLongSlow() throws IOException { // Ok, first, will always get 4 full data bytes first; 1 was already passed long l = (long) _fourBytesToInt(); // and loop for the rest while (true) { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } int value = _inputBuffer[_inputPtr++]; if (value < 0) { l = (l << 6) + (value & 0x3F); _numberLong = SmileUtil.zigzagDecode(l); return; } l = (l << 7) + value; } } private final int _fourBytesToInt() throws IOException { int ptr = _inputPtr; if ((ptr + 3) >= _inputEnd) { return _fourBytesToIntSlow(); } int i = _inputBuffer[ptr++]; // first 7 bits i = (i << 7) + _inputBuffer[ptr++]; // 14 bits i = (i << 7) + _inputBuffer[ptr++]; // 21 i = (i << 7) + _inputBuffer[ptr++]; _inputPtr = ptr; return i; } private final int _fourBytesToIntSlow() throws IOException { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } int i = _inputBuffer[_inputPtr++]; // first 7 bits if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } i = (i << 7) + _inputBuffer[_inputPtr++]; // 14 bits if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } i = (i << 7) + _inputBuffer[_inputPtr++]; // 21 if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } return (i << 7) + _inputBuffer[_inputPtr++]; } private final void _finishBigInteger() throws IOException { final byte[] raw = _read7BitBinaryWithLength(); // [dataformats-binary#257]: 0-length special case to handle if (raw.length == 0) { _numberBigInt = BigInteger.ZERO; } else { _streamReadConstraints.validateIntegerLength(raw.length); _numberBigInt = new BigInteger(raw); } _numTypesValid = NR_BIGINT; _numberType = NumberType.BIG_INTEGER; } private final void _finishFloat() throws IOException { // just need 5 bytes to get int32 first; all are unsigned int i = _fourBytesToInt(); if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } i = (i << 7) + _inputBuffer[_inputPtr++]; float f = Float.intBitsToFloat(i); _numberFloat = f; _numberType = NumberType.FLOAT; _numTypesValid = NR_FLOAT; } private final void _finishDouble() throws IOException { // ok; let's take two sets of 4 bytes (each is int) long hi = _fourBytesToInt(); long value = (hi << 28) + (long) _fourBytesToInt(); // and then remaining 2 bytes if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } value = (value << 7) + _inputBuffer[_inputPtr++]; if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } value = (value << 7) + _inputBuffer[_inputPtr++]; _numberDouble = Double.longBitsToDouble(value); _numberType = NumberType.DOUBLE; _numTypesValid = NR_DOUBLE; } private final void _finishBigDecimal() throws IOException { final int scale = SmileUtil.zigzagDecode(_readUnsignedVInt()); final byte[] raw = _read7BitBinaryWithLength(); // [dataformats-binary#257]: 0-length special case to handle if (raw.length == 0) { _numberBigDecimal = BigDecimal.ZERO; } else { _streamReadConstraints.validateFPLength(raw.length); BigInteger unscaledValue = new BigInteger(raw); _numberBigDecimal = new BigDecimal(unscaledValue, scale); } _numTypesValid = NR_BIGDECIMAL; _numberType = NumberType.BIG_DECIMAL; } protected final int _readUnsignedVInt() throws IOException { // 23-Mar-2021, tatu: Let's optimize a bit here: if we have 5 bytes // available, can avoid further boundary checks if ((_inputPtr + 5) > _inputEnd) { return _readUnsignedVIntSlow(); } int ch = _inputBuffer[_inputPtr++]; if (ch < 0) { return ch & 0x3F; } int value = ch; // 2nd byte ch = _inputBuffer[_inputPtr++]; if (ch < 0) { return (value << 6) + (ch & 0x3F); } value = (value << 7) + ch; // 3rd byte ch = _inputBuffer[_inputPtr++]; if (ch < 0) { return (value << 6) + (ch & 0x3F); } value = (value << 7) + ch; // 4th byte ch = _inputBuffer[_inputPtr++]; if (ch < 0) { return (value << 6) + (ch & 0x3F); } value = (value << 7) + ch; // 5th byte ch = _inputBuffer[_inputPtr++]; if ((ch >= 0) // invalid, should end // Must validate no overflow, as well. We can have at most 31 bits // for unsigned int, but with 4 x 7 + 6 == 34 we could have 3 "extra" bits; // at this point we have accumulated 28 bits, so shifting right by 25 should // not leave any 1 bits left: || ((value >>> 25) != 0) // overflow in first byte ) { _reportInvalidUnsignedVInt(value >>> 21, ch); } return (value << 6) + (ch & 0x3F); } // @since 2.12.3 protected final int _readUnsignedVIntSlow() throws IOException { int value = 0; int count = 0; // Read first 4 bytes do { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } int ch = _inputBuffer[_inputPtr++]; if (ch < 0) { // last byte value = (value << 6) + (ch & 0x3F); return value; } value = (value << 7) + ch; } while (++count < 4); // but if we need fifth, require validation if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } int ch = _inputBuffer[_inputPtr++]; // same validation as in optimized cvase if ((ch >= 0) // invalid, did not end with high-bit set || ((value >>> 25) != 0) // overflow in first byte ) { _reportInvalidUnsignedVInt(value >>> 21, ch); } return (value << 6) + (ch & 0x3F); } protected final void _reportInvalidUnsignedVInt(int firstCh, int lastCh) throws IOException { if (lastCh >= 0) { _reportError( "Overflow in VInt (current token %s): 5th byte (0x%2X) of 5-byte sequence must have its highest bit set to indicate end", currentToken(), lastCh); } _reportError( "Overflow in VInt (current token %s): 1st byte (0x%2X) of 5-byte sequence must have its top 4 bits zeroes", currentToken(), firstCh); } /* /********************************************************** /* Internal methods, secondary String parsing /********************************************************** */ protected final String _decodeShortAsciiValue(int len) throws IOException { if ((_inputEnd - _inputPtr) < len) { _loadToHaveAtLeast(len); } if (JDK11_OR_LATER) { // On newer JDKs the String internals changed and for ASCII strings the constructor // that takes a byte array can be used and internally is just Arrays.copyOfRange. final int inPtr = _inputPtr; _inputPtr = inPtr + len; String str = new String(_inputBuffer, inPtr, len, StandardCharsets.US_ASCII); _textBuffer.resetWithString(str); return str; } else { // Note: we count on fact that buffer must have at least 'len' (<= 64) empty char slots final char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); int outPtr = 0; final byte[] inBuf = _inputBuffer; int inPtr = _inputPtr; for (final int end = inPtr + len; inPtr < end; ++inPtr) { outBuf[outPtr++] = (char) inBuf[inPtr]; } _inputPtr = inPtr; return _textBuffer.setCurrentAndReturn(len); } } protected final String _decodeShortUnicodeValue(final int byteLen) throws IOException { if ((_inputEnd - _inputPtr) < byteLen) { _loadToHaveAtLeast(byteLen); } int outPtr = 0; char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); int inPtr = _inputPtr; _inputPtr += byteLen; final int[] codes = SmileConstants.sUtf8UnitLengths; final byte[] inputBuf = _inputBuffer; for (int end = inPtr + byteLen; inPtr < end; ) { int i = inputBuf[inPtr++]; if (i >= 0) { outBuf[outPtr++] = (char) i; continue; } i &= 0xFF; final int unitLen = codes[i]; if ((inPtr + unitLen) > end) { // Last -1 to compensate for byte that was read: final int firstCharOffset = byteLen - (end - inPtr) - 1; return _reportTruncatedUTF8InString(byteLen, firstCharOffset, i, unitLen); } switch (unitLen) { case 1: i = ((i & 0x1F) << 6) | (inputBuf[inPtr++] & 0x3F); break; case 2: i = ((i & 0x0F) << 12) | ((inputBuf[inPtr++] & 0x3F) << 6) | (inputBuf[inPtr++] & 0x3F); break; case 3:// trickiest one, need surrogate handling i = ((i & 0x07) << 18) | ((inputBuf[inPtr++] & 0x3F) << 12) | ((inputBuf[inPtr++] & 0x3F) << 6) | (inputBuf[inPtr++] & 0x3F); // note: this is the codepoint value; need to split, too i -= 0x10000; outBuf[outPtr++] = (char) (0xD800 | (i >> 10)); i = 0xDC00 | (i & 0x3FF); break; default: // invalid throw _constructReadException("Invalid byte 0x%02X in short Unicode text block", i); } outBuf[outPtr++] = (char) i; } return _textBuffer.setCurrentAndReturn(outPtr); } private final void _decodeLongAsciiValue() throws IOException { int outPtr = 0; char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); main_loop: while (true) { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } int inPtr = _inputPtr; int left = _inputEnd - inPtr; if (outPtr >= outBuf.length) { outBuf = _textBuffer.finishCurrentSegment(); outPtr = 0; } left = Math.min(left, outBuf.length - outPtr); do { byte b = _inputBuffer[inPtr++]; if (b == SmileConstants.BYTE_MARKER_END_OF_STRING) { _inputPtr = inPtr; break main_loop; } outBuf[outPtr++] = (char) b; } while (--left > 0); _inputPtr = inPtr; } _textBuffer.setCurrentLength(outPtr); } private final void _decodeLongUnicodeValue() throws IOException { int outPtr = 0; char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); final int[] codes = SmileConstants.sUtf8UnitLengths; int c; final byte[] inputBuffer = _inputBuffer; main_loop: while (true) { // First the tight ASCII loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { _loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outBuf.length) { outBuf = _textBuffer.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outBuf.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (codes[c] != 0) { _inputPtr = ptr; break ascii_loop; } outBuf[outPtr++] = (char) c; } _inputPtr = ptr; } // Ok: end marker, escape or multi-byte? if (c == SmileConstants.INT_MARKER_END_OF_STRING) { break main_loop; } switch (codes[c]) { case 1: // 2-byte UTF c = _decodeUtf8_2(c); break; case 2: // 3-byte UTF if ((_inputEnd - _inputPtr) >= 2) { c = _decodeUtf8_3fast(c); } else { c = _decodeUtf8_3(c); } break; case 3: // 4-byte UTF c = _decodeUtf8_4(c); // Let's add first part right away: outBuf[outPtr++] = (char) (0xD800 | (c >> 10)); if (outPtr >= outBuf.length) { outBuf = _textBuffer.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); // And let the other char output down below break; default: // Is this good enough error message? _reportInvalidInitial(c); } // Need more room? if (outPtr >= outBuf.length) { outBuf = _textBuffer.finishCurrentSegment(); outPtr = 0; } // Ok, let's add char to output: outBuf[outPtr++] = (char) c; } _textBuffer.setCurrentLength(outPtr); } /* /********************************************************************** /* Internal methods, secondary Binary data parsing /********************************************************************** */ // Helper method for reading complete binary data value from "raw" // value (regular byte-per-byte) private final byte[] _finishBinaryRaw() throws IOException { int byteLen = _readUnsignedVInt(); // 20-Mar-2021, tatu [dataformats-binary#260]: avoid eager allocation // for very large content if (byteLen > LONGEST_NON_CHUNKED_BINARY) { return _finishBinaryRawLong(byteLen); } // But use simpler, no intermediate buffering, for more compact cases final int expLen = byteLen; final byte[] b = new byte[byteLen]; int ptr = 0; while (byteLen > 0) { if (_inputPtr >= _inputEnd) { if (!_loadMore()) { _reportIncompleteBinaryReadRaw(expLen, ptr); } } int toAdd = Math.min(byteLen, _inputEnd - _inputPtr); System.arraycopy(_inputBuffer, _inputPtr, b, ptr, toAdd); _inputPtr += toAdd; ptr += toAdd; byteLen -= toAdd; } return b; } // @since 2.12.3 protected byte[] _finishBinaryRawLong(final int expLen) throws IOException { int left = expLen; // 20-Mar-2021, tatu: Let's NOT use recycled instance since we have much // longer content and there is likely less benefit of trying to recycle // segments try (final ByteArrayBuilder bb = new ByteArrayBuilder(LONGEST_NON_CHUNKED_BINARY >> 1)) { while (left > 0) { int avail = _inputEnd - _inputPtr; if (avail <= 0) { if (!_loadMore()) { _reportIncompleteBinaryReadRaw(expLen, expLen-left); } avail = _inputEnd - _inputPtr; } int count = Math.min(avail, left); bb.write(_inputBuffer, _inputPtr, count); _inputPtr += count; left -= count; } return bb.toByteArray(); } } // Helper method for reading full contents of a 7-bit (7/8) encoded // binary data chunk: starting with leading leading VInt length indicator // followed by encoded data private final byte[] _read7BitBinaryWithLength() throws IOException { final int byteLen = _readUnsignedVInt(); // 20-Mar-2021, tatu [dataformats-binary#260]: avoid eager allocation // for very large content if (byteLen > LONGEST_NON_CHUNKED_BINARY) { return _finishBinary7BitLong(byteLen); } final byte[] result = new byte[byteLen]; final int lastOkPtr = byteLen - 7; int ptr = 0; // first, read all 7-by-8 byte chunks while (ptr <= lastOkPtr) { if ((_inputEnd - _inputPtr) < 8) { int missing = _tryToLoadToHaveAtLeast(8); if (missing > 0) { _reportIncompleteBinaryRead7Bit(byteLen, ptr); } } int i1 = (_inputBuffer[_inputPtr++] << 25) + (_inputBuffer[_inputPtr++] << 18) + (_inputBuffer[_inputPtr++] << 11) + (_inputBuffer[_inputPtr++] << 4); int x = _inputBuffer[_inputPtr++]; i1 += x >> 3; int i2 = ((x & 0x7) << 21) + (_inputBuffer[_inputPtr++] << 14) + (_inputBuffer[_inputPtr++] << 7) + _inputBuffer[_inputPtr++]; // Ok: got our 7 bytes, just need to split, copy result[ptr++] = (byte)(i1 >> 24); result[ptr++] = (byte)(i1 >> 16); result[ptr++] = (byte)(i1 >> 8); result[ptr++] = (byte)i1; result[ptr++] = (byte)(i2 >> 16); result[ptr++] = (byte)(i2 >> 8); result[ptr++] = (byte)i2; } // and then leftovers: n+1 bytes to decode n bytes int toDecode = (result.length - ptr); if (toDecode > 0) { if ((_inputEnd - _inputPtr) < (toDecode+1)) { int missing = _tryToLoadToHaveAtLeast(toDecode+1); if (missing > 0) { _reportIncompleteBinaryRead7Bit(byteLen, ptr); } } int value = _inputBuffer[_inputPtr++]; for (int i = 1; i < toDecode; ++i) { value = (value << 7) + _inputBuffer[_inputPtr++]; result[ptr++] = (byte) (value >> (7 - i)); } // last byte is different, has remaining 1 - 6 bits, right-aligned value <<= toDecode; result[ptr] = (byte) (value + _inputBuffer[_inputPtr++]); } return result; } // @since 2.12.3 protected byte[] _finishBinary7BitLong(final int expLen) throws IOException { // No need to try to use recycled instance since we have much longer content // and there is likely less benefit of trying to recycle segments try (final ByteArrayBuilder bb = new ByteArrayBuilder(LONGEST_NON_CHUNKED_BINARY >> 1)) { // Decode 1k input chunk at a time final byte[] buffer = new byte[7 * 128]; int left = expLen; int bufPtr = 0; // Main loop for full 7/8 units: while (left >= 7) { if ((_inputEnd - _inputPtr) < 8) { int missing = _tryToLoadToHaveAtLeast(8); if (missing > 0) { _reportIncompleteBinaryRead7Bit(expLen, bb.size() + bufPtr); } } int i1 = (_inputBuffer[_inputPtr++] << 25) + (_inputBuffer[_inputPtr++] << 18) + (_inputBuffer[_inputPtr++] << 11) + (_inputBuffer[_inputPtr++] << 4); int x = _inputBuffer[_inputPtr++]; i1 += x >> 3; int i2 = ((x & 0x7) << 21) + (_inputBuffer[_inputPtr++] << 14) + (_inputBuffer[_inputPtr++] << 7) + _inputBuffer[_inputPtr++]; // Ok: got our 7 bytes, just need to split, copy // NOTE: lgtm cannot deduce the checks but a single bounds check IS enough here buffer[bufPtr++] = (byte)(i1 >> 24); buffer[bufPtr++] = (byte)(i1 >> 16); // lgtm [java/index-out-of-bounds] buffer[bufPtr++] = (byte)(i1 >> 8); // lgtm [java/index-out-of-bounds] buffer[bufPtr++] = (byte)i1; // lgtm [java/index-out-of-bounds] buffer[bufPtr++] = (byte)(i2 >> 16); // lgtm [java/index-out-of-bounds] buffer[bufPtr++] = (byte)(i2 >> 8); // lgtm [java/index-out-of-bounds] buffer[bufPtr++] = (byte)i2; // lgtm [java/index-out-of-bounds] if (bufPtr >= buffer.length) { bb.write(buffer, 0, bufPtr); bufPtr = 0; } left -= 7; } // And then the last one; we know there is room in buffer so: // and then leftovers: n+1 bytes to decode n bytes if (left > 0) { if ((_inputEnd - _inputPtr) < (left+1)) { _loadToHaveAtLeast(left+1); } int value = _inputBuffer[_inputPtr++]; for (int i = 1; i < left; ++i) { value = (value << 7) + _inputBuffer[_inputPtr++]; buffer[bufPtr++] = (byte) (value >> (7 - i)); } // last byte is different, has remaining 1 - 6 bits, right-aligned value <<= left; buffer[bufPtr++] = (byte) (value + _inputBuffer[_inputPtr++]); } if (bufPtr > 0) { bb.write(buffer, 0, bufPtr); } return bb.toByteArray(); } } /* /********************************************************************** /* Internal methods, skipping /********************************************************************** */ /** * Method called to skip remainders of an incomplete token, when * contents themselves will not be needed any more */ protected void _skipIncomplete() throws IOException { _tokenIncomplete = false; int tb = _typeAsInt; switch (tb >> 5) { case 1: // simple literals, numbers tb &= 0x1F; // next 3 bytes define subtype switch (tb >> 2) { case 1: // VInt (zigzag) // easy, just skip until we see sign bit... (should we try to limit damage?) switch (tb & 0x3) { case 1: // vlong _skipBytes(4); // min 5 bytes // fall through case 0: // vint while (true) { final int end = _inputEnd; final byte[] buf = _inputBuffer; while (_inputPtr < end) { if (buf[_inputPtr++] < 0) { return; } } _loadMoreGuaranteed(); } case 2: // big-int // just has binary data _skip7BitBinary(); return; } break; case 2: // other numbers switch (tb & 0x3) { case 0: // float _skipBytes(5); return; case 1: // double _skipBytes(10); return; case 2: // big-decimal // first, skip scale _readUnsignedVInt(); // then length-prefixed binary serialization _skip7BitBinary(); return; } break; } break; case 2: // tiny ASCII // fall through case 3: // short ASCII _skipBytes(1 + (tb & 0x3F)); return; case 4: // tiny unicode // fall through case 5: // short unicode _skipBytes(2 + (tb & 0x3F)); return; case 7: tb &= 0x1F; // next 3 bytes define subtype switch (tb >> 2) { case 0: // long variable length ASCII case 1: // long variable length unicode /* Doesn't matter which one, just need to find the end marker * (note: can potentially skip invalid UTF-8 too) */ while (true) { final int end = _inputEnd; final byte[] buf = _inputBuffer; while (_inputPtr < end) { if (buf[_inputPtr++] == BYTE_MARKER_END_OF_STRING) { return; } } _loadMoreGuaranteed(); } // never gets here case 2: // binary, 7-bit _skip7BitBinary(); return; case 7: // binary, raw _skipBytes(_readUnsignedVInt()); return; } } _throwInternal(); } protected void _skipBytes(int len) throws IOException { // 18-Dec-2023, tatu: Sanity check related to some OSS-Fuzz findings: if (len < 0) { throw _constructReadException("Internal error: _skipBytes() called with negative value: %d", len); } while (true) { int toAdd = Math.min(len, _inputEnd - _inputPtr); _inputPtr += toAdd; len -= toAdd; if (len <= 0) { return; } _loadMoreGuaranteed(); } } /** * Helper method for skipping length-prefixed binary data * section */ protected void _skip7BitBinary() throws IOException { int origBytes = _readUnsignedVInt(); // Ok; 8 encoded bytes for 7 payload bytes first int chunks = origBytes / 7; int encBytes = chunks * 8; // sanity check: not all length markers valid; due to signed int(32) // calculations maximum length only 7/8 of 2^31 if (encBytes < 0) { throw _constructReadException( "Invalid content: invalid 7-bit binary encoded byte length (0x%X) exceeds maximum valid value", origBytes); } // and for last 0 - 6 bytes, last+1 (except none if no leftovers) origBytes -= 7 * chunks; if (origBytes > 0) { encBytes += 1 + origBytes; } _skipBytes(encBytes); } /* /********************************************************** /* Internal methods, UTF8 decoding /********************************************************** */ private final int _decodeUtf8_2(int c) throws IOException { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { _reportInvalidOther(d & 0xFF, _inputPtr); } return ((c & 0x1F) << 6) | (d & 0x3F); } private final int _decodeUtf8_3(int c1) throws IOException { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } c1 &= 0x0F; int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { _reportInvalidOther(d & 0xFF, _inputPtr); } int c = (c1 << 6) | (d & 0x3F); if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { _reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); return c; } private final int _decodeUtf8_3fast(int c1) throws IOException { c1 &= 0x0F; int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { _reportInvalidOther(d & 0xFF, _inputPtr); } int c = (c1 << 6) | (d & 0x3F); d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { _reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); return c; } /** * @return Character value minus 0x10000; this so that caller * can readily expand it to actual surrogates */ private final int _decodeUtf8_4(int c) throws IOException { if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { _reportInvalidOther(d & 0xFF, _inputPtr); } c = ((c & 0x07) << 6) | (d & 0x3F); if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { _reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); if (_inputPtr >= _inputEnd) { _loadMoreGuaranteed(); } d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { _reportInvalidOther(d & 0xFF, _inputPtr); } /* note: won't change it to negative here, since caller * already knows it'll need a surrogate */ return ((c << 6) | (d & 0x3F)) - 0x10000; } /* /********************************************************** /* Internal methods, error reporting /********************************************************** */ protected void _reportInvalidSharedName(int index) throws IOException { if (_seenNames == null) { _reportError("Encountered shared name reference, even though document header explicitly declared no shared name references are included"); } _reportError("Invalid shared name reference "+index+"; only got "+_seenNameCount+" names in buffer (invalid content)"); } protected void _reportInvalidSharedStringValue(int index) throws IOException { if (_seenStringValues == null) { _reportError("Encountered shared text value reference, even though document header did not declare shared text value references may be included"); } _reportError("Invalid shared text value reference "+index+"; only got "+_seenStringValueCount+" names in buffer (invalid content)"); } protected void _reportInvalidInitial(int mask) throws JsonParseException { _reportError("Invalid UTF-8 start byte 0x"+Integer.toHexString(mask)); } protected void _reportInvalidOther(int mask) throws JsonParseException { _reportError("Invalid UTF-8 middle byte 0x"+Integer.toHexString(mask)); } protected void _reportInvalidOther(int mask, int ptr) throws JsonParseException { _inputPtr = ptr; _reportInvalidOther(mask); } // @since 2.12.3 protected void _reportIncompleteBinaryReadRaw(int expLen, int actLen) throws IOException { _reportInvalidEOF(String.format( " for Binary value (raw): expected %d bytes, only found %d", expLen, actLen), currentToken()); } // @since 2.12.3 protected void _reportIncompleteBinaryRead7Bit(int expLen, int actLen) throws IOException { // Calculate number of bytes needed (1 encoded byte expresses 7 payload bits): final long encodedLen = (7L + 8L * expLen) / 7L; _reportInvalidEOF(String.format( " for Binary value (7-bit): expected %d payload bytes (from %d encoded), only decoded %d", expLen, encodedLen, actLen), currentToken()); } // @since 2.12.3 protected String _reportTruncatedUTF8InString(int strLenBytes, int truncatedCharOffset, int firstUTFByteValue, int bytesExpected) throws IOException { throw _constructReadException(String.format( "Truncated UTF-8 character in Short Unicode String value (%d bytes): " +"byte 0x%02X at offset #%d indicated %d more bytes needed", strLenBytes, firstUTFByteValue, truncatedCharOffset, bytesExpected)); } protected String _reportTruncatedUTF8InName(int strLenBytes, int truncatedCharOffset, int firstUTFByteValue, int bytesExpected) throws IOException { throw _constructReadException(String.format( "Truncated UTF-8 character in Short Unicode Name (%d bytes): " +"byte 0x%02X at offset #%d indicated %d more bytes needed", strLenBytes, firstUTFByteValue, truncatedCharOffset, bytesExpected)); } /* /********************************************************** /* Internal methods, other /********************************************************** */ private final JsonToken _eofAsNextToken() throws IOException { if (!_streamReadContext.inRoot()) { _handleEOF(); } close(); return _updateTokenToNull(); } private void createChildArrayContext(final int lineNr, final int colNr) throws IOException { _streamReadContext = _streamReadContext.createChildArrayContext(lineNr, colNr); _streamReadConstraints.validateNestingDepth(_streamReadContext.getNestingDepth()); } private void createChildObjectContext(final int lineNr, final int colNr) throws IOException { _streamReadContext = _streamReadContext.createChildObjectContext(lineNr, colNr); _streamReadConstraints.validateNestingDepth(_streamReadContext.getNestingDepth()); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy