All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.codehaus.jackson.smile.SmileConstants Maven / Gradle / Ivy

Go to download

Support for efficient binary data format that can be access same as textual JSON ("binary infoset for JSON"), using standard Jackson abstractions: specifically, extension of JsonFactory and implementations of JsonParser and JsonGenerator.

There is a newer version: 1.9.13
Show newest version
package org.codehaus.jackson.smile;

/**
 * Constants used by {@link SmileGenerator} and {@link SmileParser}
 * 
 * @author tatu
 */
public final class SmileConstants
{
    /*
    /**********************************************************
    /* Thresholds
    /**********************************************************
     */

    /**
     * Encoding has special "short" forms for value Strings that can
     * be represented by 64 bytes of UTF-8 or less.
     */
    public final static int MAX_SHORT_VALUE_STRING_BYTES = 64;

    /**
     * Encoding has special "short" forms for field names that can
     * be represented by 64 bytes of UTF-8 or less.
     */
    public final static int MAX_SHORT_NAME_ASCII_BYTES = 64;

    /**
     * Maximum byte length for short non-ASCII names is slightly
     * less due to having to reserve bytes 0xF8 and above (but
     * we get one more as values 0 and 1 are not valid)
     */
    public final static int MAX_SHORT_NAME_UNICODE_BYTES = 56;
    
    /**
     * Longest back reference we use for field names is 10 bits; no point
     * in keeping much more around
     */
    public final static int MAX_SHARED_NAMES = 1024;

    /**
     * Longest back reference we use for short shared String values is 10 bits,
     * so up to (1 << 10) values to keep track of.
     */
    public final static int MAX_SHARED_STRING_VALUES = 1024;

    /**
     * Also: whereas we can refer to names of any length, we will only consider
     * text values that are considered "tiny" or "short" (ones encoded with
     * length prefix); this value thereby has to be maximum length of Strings
     * that can be encoded as such.
     */
    public final static int MAX_SHARED_STRING_LENGTH_BYTES = 65;
    
    /**
     * And to make encoding logic tight and simple, we can always
     * require that output buffer has this amount of space
     * available before encoding possibly short String (3 bytes since
     * longest UTF-8 encoded Java char is 3 bytes).
     * Two extra bytes need to be reserved as well; first for token indicator,
     * and second for terminating null byte (in case it's not a short String after all)
     */
    public final static int MIN_BUFFER_FOR_POSSIBLE_SHORT_STRING = 1 + (3 * 65);

    /*
    /**********************************************************
    /* Byte markers
    /**********************************************************
     */
    
    /**
     * We need a byte marker to denote end of variable-length Strings. Although
     * null byte is commonly used, let's try to avoid using it since it can't
     * be embedded in Web Sockets content (similarly, 0xFF can't). There are
     * multiple candidates for bytes UTF-8 can not have; 0xFC is chosen to
     * allow reasonable ordering (highest values meaning most significant
     * framing function; 0xFF being end-of-content and so on)
     */
    public final static int INT_MARKER_END_OF_STRING = 0xFC;

    public final static byte BYTE_MARKER_END_OF_STRING = (byte) INT_MARKER_END_OF_STRING;
    
    /**
     * In addition we can use a marker to allow simple framing; splitting
     * of physical data (like file) into distinct logical sections like
     * JSON documents. 0xFF makes sense here since it is also used
     * as end marker for Web Sockets.
     */
    public final static byte BYTE_MARKER_END_OF_CONTENT = (byte) 0xFF;

    /*
    /**********************************************************
    /* Format header: put smile on your data...
    /**********************************************************
     */

    /**
     * First byte of data header
     */
    public final static byte HEADER_BYTE_1 = (byte) ':';

    /**
     * Second byte of data header
     */
    public final static byte HEADER_BYTE_2 = (byte) ')';

    /**
     * Third byte of data header
     */
    public final static byte HEADER_BYTE_3 = (byte) '\n';

    /**
     * Current version consists of four zero bits (nibble)
     */
    public final static int HEADER_VERSION_0 = 0x0;

    /**
     * Fourth byte of data header; contains version nibble, may
     * have flags
     */
    public final static byte HEADER_BYTE_4 = (HEADER_VERSION_0 << 4);
    
    /**
     * Indicator bit that indicates whether encoded content may 
     * have Shared names (back references to recently encoded field
     * names). If no header available, must be
     * processed as if this was set to true.
     * If (and only if) header exists, and value is 0, can parser
     * omit storing of seen names, as it is guaranteed that no back
     * references exist.
     */
    public final static int HEADER_BIT_HAS_SHARED_NAMES = 0x01;

    /**
     * Indicator bit that indicates whether encoded content may
     * have shared String values (back references to recently encoded
     * 'short' String values, where short is defined as 64 bytes or less).
     * If no header available, can be assumed to be 0 (false).
     * If header exists, and bit value is 1, parsers has to store up
     * to 1024 most recently seen distinct short String values.
     */
    public final static int HEADER_BIT_HAS_SHARED_STRING_VALUES = 0x02;

    /**
     * Indicator bit that indicates whether encoded content may
     * contain raw (unquoted) binary values.
     * If no header available, can be assumed to be 0 (false).
     * If header exists, and bit value is 1, parser can not assume that
     * specific byte values always have default meaning (specifically,
     * content end marker 0xFF and header signature can be contained
     * in binary values)
     *

* Note that this bit being true does not automatically mean that * such raw binary content indeed exists; just that it may exist. * This because header is written before any binary data may be * written. */ public final static int HEADER_BIT_HAS_RAW_BINARY = 0x04; /* /********************************************************** /* Type prefixes: 3 MSB of token byte /********************************************************** */ // Shared strings are back references for last 63 short (< 64 byte) string values // NOTE: 0x00 is reserved, not used with current version (may be used in future) public final static int TOKEN_PREFIX_SHARED_STRING_SHORT = 0x00; // literals are put between 0x20 and 0x3F to reserve markers (smiley), along with ints/doubles //public final static int TOKEN_PREFIX_MISC_NUMBERS = 0x20; public final static int TOKEN_PREFIX_TINY_ASCII = 0x40; public final static int TOKEN_PREFIX_SMALL_ASCII = 0x60; public final static int TOKEN_PREFIX_TINY_UNICODE = 0x80; public final static int TOKEN_PREFIX_SHORT_UNICODE = 0xA0; // Small ints are 4-bit (-16 to +15) integer constants public final static int TOKEN_PREFIX_SMALL_INT = 0xC0; // And misc types have empty at the end too, to reserve 0xF8 - 0xFF public final static int TOKEN_PREFIX_MISC_OTHER = 0xE0; /* /********************************************************** /* Token literals, normal mode /********************************************************** */ // First, non-structured literals public final static byte TOKEN_LITERAL_EMPTY_STRING = 0x20; public final static byte TOKEN_LITERAL_NULL = 0x21; public final static byte TOKEN_LITERAL_FALSE = 0x22; public final static byte TOKEN_LITERAL_TRUE = 0x23; // And then structured literals public final static byte TOKEN_LITERAL_START_ARRAY = (byte) 0xF8; public final static byte TOKEN_LITERAL_END_ARRAY = (byte) 0xF9; public final static byte TOKEN_LITERAL_START_OBJECT = (byte) 0xFA; public final static byte TOKEN_LITERAL_END_OBJECT = (byte) 0xFB; /* /********************************************************** /* Subtype constants for misc text/binary types /********************************************************** */ /** * Type (for misc, other) used * for regular integral types (byte/short/int/long) */ public final static int TOKEN_MISC_INTEGER = 0x24; /** * Type (for misc, other) used * for regular floating-point types (float, double) */ public final static int TOKEN_MISC_FP = 0x28; /** * Type (for misc, other) used for * variable length UTF-8 encoded text, when it is known to only contain ASCII chars. * Note: 2 LSB are reserved for future use; must be zeroes for now */ public final static int TOKEN_MISC_LONG_TEXT_ASCII = 0xE0; /** * Type (for misc, other) used * for variable length UTF-8 encoded text, when it is NOT known to only contain ASCII chars * (which means it MAY have multi-byte characters) * Note: 2 LSB are reserved for future use; must be zeroes for now */ public final static int TOKEN_MISC_LONG_TEXT_UNICODE = 0xE4; /** * Type (for misc, other) used * for "safe" (encoded by only using 7 LSB, giving 8/7 expansion ratio). * This is usually done to ensure that certain bytes are never included * in encoded data (like 0xFF) * Note: 2 LSB are reserved for future use; must be zeroes for now */ public final static int TOKEN_MISC_BINARY_7BIT = 0xE8; /** * Type (for misc, other) used for shared String values where index * does not fit in "short" reference range (which is 0 - 30). If so, * 2 LSB from here and full following byte are used to get 10-bit * index. Values */ public final static int TOKEN_MISC_SHARED_STRING_LONG = 0xEC; /** * Raw binary data marker is specifically chosen as separate from * other types, since it can have significant impact on framing * (or rather fast scanning based on structure and framing markers). */ public final static int TOKEN_MISC_BINARY_RAW = 0xFD; /* /********************************************************** /* Modifiers for numeric entries /********************************************************** */ /** * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_INTEGER}, * indicating 32-bit integer (int) */ public final static int TOKEN_MISC_INTEGER_32 = 0x00; /** * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_INTEGER}, * indicating 32-bit integer (long) */ public final static int TOKEN_MISC_INTEGER_64 = 0x01; /** * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_INTEGER}, * indicating {@link java.math.BigInteger} type. */ public final static int TOKEN_MISC_INTEGER_BIG = 0x02; // Note: type 3 (0xF3) reserved for future use /** * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_FP}, * indicating 32-bit IEEE single precision floating point number. */ public final static int TOKEN_MISC_FLOAT_32 = 0x00; /** * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_FP}, * indicating 64-bit IEEE double precision floating point number. */ public final static int TOKEN_MISC_FLOAT_64 = 0x01; /** * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_FP}, * indicating {@link java.math.BigDecimal} type. */ public final static int TOKEN_MISC_FLOAT_BIG = 0x02; // Note: type 3 (0xF7) reserved for future use /* /********************************************************** /* Token types for keys /********************************************************** */ /** * Let's use same code for empty key as for empty String value */ public final static byte TOKEN_KEY_EMPTY_STRING = 0x20; public final static int TOKEN_PREFIX_KEY_SHARED_LONG = 0x30; public final static byte TOKEN_KEY_LONG_STRING = 0x34; public final static int TOKEN_PREFIX_KEY_SHARED_SHORT = 0x40; public final static int TOKEN_PREFIX_KEY_ASCII = 0x80; public final static int TOKEN_PREFIX_KEY_UNICODE = 0xC0; /* /********************************************************** /* Basic UTF-8 decode/encode table /********************************************************** */ /** * Additionally we can combine UTF-8 decoding info into similar * data table. * Values indicate "byte length - 1"; meaning -1 is used for * invalid bytes, 0 for single-byte codes, 1 for 2-byte codes * and 2 for 3-byte codes. */ public final static int[] sUtf8UnitLengths; static { int[] table = new int[256]; for (int c = 128; c < 256; ++c) { int code; // We'll add number of bytes needed for decoding if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) code = 1; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) code = 2; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... code = 3; } else { // And -1 seems like a good "universal" error marker... code = -1; } table[c] = code; } sUtf8UnitLengths = table; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy