/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Apr 30, 2007
*/
package com.bigdata.btree.keys;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.Locale;
import java.util.Properties;
import java.util.UUID;
import com.bigdata.btree.keys.KeyBuilder.Options;
import com.bigdata.io.IManagedByteArray;
import com.bigdata.util.BytesUtil;
/**
*
* Interface for building up variable <code>unsigned byte[]</code> keys from
* one or more primitive data type values and/or Unicode strings. An instance
* of this interface may be {@link #reset()} and reused to encode a series of
* keys.
*
*
* A sort key is an unsigned byte[] that preserves the total order of the
* original data. Sort keys may potentially be formed from multiple fields but
* field markers do not appear within the resulting sort key. While the original
* values can be extracted from sort keys (this is true of all the fixed length
* fields, such as int, long, float, or double) they can not be extracted from
* Unicode variable length fields (the collation ordering for a Unicode string
* depends on the {@link Locale}, the collation strength, and the decomposition
* mode and is a non-reversible operation).
*
* Unicode
*
* Factory methods are defined by {@link KeyBuilder} for obtaining instances of
* this interface that optionally support Unicode. Instances may be created for
* a given {@link Locale}, collation strength, decomposition mode, etc.
*
*
* The ICU library supports generation of compressed Unicode sort keys and is
* used by default when available. The JDK {@link java.text} package also
* supports the generation of Unicode sort keys, but it does NOT produce
* compressed sort keys. The resulting sort keys are therefore (a) incompatible
* with those produced by the ICU library and (b) much larger than those
* produced by the ICU library.
*
*
* Support for Unicode MAY be disabled using {@link Options#COLLATOR}, by using
* {@link KeyBuilder#newInstance()} or another factory method that does not
* enable Unicode support, or by using one of the {@link KeyBuilder}
* constructors that does not support Unicode.
*
* Multi-field keys with variable length fields
*
* Multi-field keys in which variable length fields are embedded within the key
* present a special problem. Any run of fixed length fields can be compared as
* unsigned byte[]s. Likewise, any key with a fixed length prefix (including
* zero) but a variable length field in its tail can also be compared directly
* as unsigned byte[]s. However, the introduction of a variable length field
* into any non-terminal position in a multi-field key must be handled specially
* since simple concatenation of the field keys will NOT produce the correct
* total ordering. (This is why SQL requires that text fields compare as if they
* were padded out with ASCII blanks (0x20) to some maximum length for the
* field.) A utility method exists specifically for this purpose - see
* {@link #appendText(String, boolean, boolean)}.
*
*
* @see KeyBuilder#newInstance()
* @see KeyBuilder#newUnicodeInstance()
* @see KeyBuilder#newUnicodeInstance(Properties)
* @see SuccessorUtil
*
* @author Bryan Thompson
* @version $Id$
*/
public interface IKeyBuilder extends ISortKeyBuilder, IManagedByteArray {

    /**
     * The backing byte[] WILL be transparently replaced if the buffer capacity
     * is extended. {@inheritDoc}
     */
    byte[] array();

    /**
     * The offset of the slice into the backing byte[] is always zero.
     * {@inheritDoc}
     */
    int off();

    /**
     * The length of the slice is the number of bytes written onto the backing
     * byte[]. This is set to ZERO (0) by {@link #reset()}. {@inheritDoc}
     */
    int len();

    /**
     * Return the encoded key. Comparison of keys returned by this method MUST
     * treat the array as an array of unsigned bytes.
     * <p>
     * Note that keys are donated to the btree so it is important to allocate
     * new keys when running in the same process space. When using a network
     * api, the api provides the necessary decoupling.
     *
     * @return A new array containing the key.
     *
     * @see BytesUtil#compareBytes(byte[], byte[])
     */
    byte[] getKey();

    /**
     * An alias for {@link #getKey()}.
     * <p>
     * {@inheritDoc}
     */
    byte[] toByteArray();

    /**
     * Reset the key length to zero before building another key.
     *
     * @return this
     */
    IKeyBuilder reset();

    /*
     * Optional operations.
     */

    /*
     * FIXME update the javadoc of append(String) further to speak to handling
     * of multi-field keys.
     *
     * TODO provide a more flexible interface for handling Unicode, including
     * the means to encode using a specified language family (such as could be
     * identified with an xml:lang attribute).
     */
    /**
     * Encodes a Unicode string using the configured {@link Options#COLLATOR}
     * and appends the resulting sort key to the buffer (without a trailing
     * {@code nul} byte).
     * <p>
     * Note: The {@link SuccessorUtil#successor(String)} of a string is formed
     * by appending a trailing {@code nul} character. However, since
     * {@code IDENTICAL} appears to be required to differentiate between a
     * string and its successor (with the trailing {@code nul} character), you
     * MUST form the sort key first and then its successor (by appending a
     * trailing {@code nul}). Failure to follow this pattern will lead to the
     * successor of the key comparing as EQUAL to the key. For example,
     *
     * <pre>
     * IKeyBuilder keyBuilder = ...;
     *
     * String s = "foo";
     *
     * byte[] fromKey = keyBuilder.reset().append( s ).getKey();
     *
     * // right.
     * byte[] toKey = keyBuilder.reset().append( s ).appendNul().getKey();
     *
     * // wrong!
     * byte[] toKey = keyBuilder.reset().append( s+"\0" ).getKey();
     * </pre>
     *
     * @param s
     *            A string.
     *
     * @return this
     *
     * @throws UnsupportedOperationException
     *             if Unicode is not supported.
     *
     * @see SuccessorUtil#successor(String)
     * @see SuccessorUtil#successor(byte[])
     * @see TestICUUnicodeKeyBuilder#test_keyBuilder_unicode_trailingNuls()
     */
    IKeyBuilder append(String s);

    /**
     * Encodes a variable length text field into the buffer. The text is
     * truncated to {@link IKeyBuilder#maxlen} characters. The sort keys for
     * strings that differ after truncation solely in the #of trailing
     * {@link #pad} characters will be identical (trailing pad characters are
     * implicit out to {@link #maxlen} characters).
     * <p>
     * Note: Trailing pad characters are normalized to a representation as a
     * single pad character (1 byte) followed by the #of actual or implied
     * trailing pad characters represented as an unsigned short integer (2
     * bytes). This technique serves to keep multi-field keys with embedded
     * variable length text fields aligned such that the field following a
     * variable length text field does not bleed into the lexicographic
     * ordering of the variable length text field.
     * <p>
     * Note: While the ASCII encoding happens to use one byte for each character
     * that is NOT true of the Unicode encoding. The space requirements for the
     * Unicode encoding depend on the text, the Locale, the collator strength,
     * and the collator decomposition mode.
     * <p>
     * Note: The {@code successor} option is designed to encapsulate some
     * trickiness around forming the successor of a variable length text field
     * embedded in a multi-field key. In particular, simply appending a
     * {@code nul} byte will NOT work (it works fine when the text field is the
     * last field in the key or when it is the only component in the key). This
     * approach breaks encapsulation of the field boundaries such that the
     * resulting "successor" is actually ordered before the original key. This
     * happens because you introduce a 0x0 byte right on the boundary of the
     * next field, effectively causing the next field to have a smaller value.
     * Consider the following example (in hex) where "|" represents the end of
     * the "text" field:
     *
     * <pre>
     * ab cd | 12
     * </pre>
     *
     * if you compute the successor by appending a nul byte to the text field
     * you get
     *
     * <pre>
     * ab cd | 00 12
     * </pre>
     *
     * which is ordered before the original key!
     *
     * @param text
     *            The text.
     * @param unicode
     *            When {@code true} the text is interpreted as Unicode according
     *            to the {@link Options#COLLATOR} option. Otherwise it is
     *            interpreted as ASCII.
     * @param successor
     *            When {@code true}, the successor of the text will be encoded.
     *            Otherwise the text will be encoded.
     *
     * @return The {@link IKeyBuilder}.
     *
     * @see <a
     *      href="http://www.unicode.org/reports/tr10/tr10-10.html#Interleaved_Levels">
     *      Unicode Collation Algorithm (UTS #10), Interleaved Levels</a>
     */
    IKeyBuilder appendText(String text, boolean unicode, boolean successor);

    /*
     * Note: This operation is not implemented since it can cause confusion so
     * easily. If you want Unicode encoding use append(String). If you want
     * ASCII encoding, use appendASCII(String).
     */
    // /**
    // * Encodes a character as a Unicode sort key by first converting it to a
    // * unicode string of length N and then encoding it using
    // * {@link #append(String)} (optional operation).
    // *
    // * @throws UnsupportedOperationException
    // * if Unicode is not supported.
    // *
    // * @return this
    // */
    // public IKeyBuilder append(char[] v);

    /*
     * Required operations.
     */

    /**
     * Return {@code true} iff Unicode is supported by this object (returns
     * {@code false} if only ASCII support is configured).
     */
    boolean isUnicodeSupported();

    /**
     * The maximum length of a variable length text field is {@code 65535}
     * ({@code pow(2,16)-1}).
     * <p>
     * Note: This restriction only applies to multi-field keys where the text
     * field appears in a non-terminal position within the key - that is as
     * encoded by {@link #appendText(String, boolean, boolean)}. When a text
     * field appears in such a non-terminal position trailing pad characters are
     * used to maintain lexicographic ordering over the multi-field key.
     */
    // Interface fields are implicitly public, static and final.
    int maxlen = 65535;

    /**
     * Encodes a unicode string by assuming that its contents are ASCII
     * characters. For each character, this method simply chops off the high
     * byte and converts the low byte to an unsigned byte.
     * <p>
     * Note: This method is potentially much faster than the Unicode aware
     * {@link #append(String)}. However, this method is NOT unicode aware and
     * non-ASCII characters will not be encoded correctly. This method MUST NOT
     * be mixed with keys whose corresponding component is encoded by the
     * unicode aware methods, e.g., {@link #append(String)}.
     *
     * @param s
     *            A String containing US-ASCII characters.
     *
     * @return this
     */
    IKeyBuilder appendASCII(String s);

    /**
     * Appends a byte - the byte is treated as an {@code unsigned} value.
     *
     * @param b
     *            The byte.
     *
     * @return this
     */
    IKeyBuilder append(byte b);

    /**
     * Appends an array of bytes - the bytes are treated as {@code unsigned}
     * values.
     *
     * @param a
     *            The array of bytes.
     *
     * @return this
     */
    IKeyBuilder append(byte[] a);

    /**
     * Append {@code len} bytes starting at {@code off} in {@code a} to the key
     * buffer - the bytes are treated as {@code unsigned} values.
     *
     * @param a
     *            The array containing the bytes to append.
     * @param off
     *            The offset.
     * @param len
     *            The #of bytes to append.
     *
     * @return this
     */
    IKeyBuilder append(byte[] a, int off, int len);

    /**
     * Appends a double precision floating point value by first converting it
     * into a signed long integer using {@link Double#doubleToLongBits(double)},
     * converting that value into a twos-complement number and then appending
     * the bytes in big-endian order into the key buffer.
     * <p>
     * Note: this converts -0d and +0d to the same key.
     *
     * @param d
     *            The double-precision floating point value.
     *
     * @return this
     */
    IKeyBuilder append(double d);

    /**
     * Appends a single precision floating point value by first converting it
     * into a signed integer using {@link Float#floatToIntBits(float)},
     * converting that value into a twos-complement number and then appending
     * the bytes in big-endian order into the key buffer.
     * <p>
     * Note: this converts -0f and +0f to the same key.
     *
     * @param f
     *            The single-precision floating point value.
     *
     * @return this
     */
    IKeyBuilder append(float f);

    /**
     * Appends the UUID to the key using the MSB and then the LSB (this
     * preserves the natural order imposed by {@link UUID#compareTo(UUID)}).
     *
     * @param uuid
     *            The UUID.
     *
     * @return this
     */
    IKeyBuilder append(UUID uuid);

    /**
     * Appends a signed long integer to the key by first converting it to a
     * lexicographic ordering as an unsigned long integer and then appending it
     * into the buffer as 8 bytes using a big-endian order.
     *
     * @param v
     *            The signed long integer.
     *
     * @return this
     */
    IKeyBuilder append(long v);

    /**
     * Appends a signed integer to the key by first converting it to a
     * lexicographic ordering as an unsigned integer and then appending it into
     * the buffer as 4 bytes using a big-endian order.
     *
     * @param v
     *            The signed integer.
     *
     * @return this
     */
    IKeyBuilder append(int v);

    /**
     * Appends a signed short integer to the key by first converting it to a
     * twos-complement representation supporting unsigned byte[] comparison and
     * then appending it into the buffer as 2 bytes using a big-endian order.
     *
     * @param v
     *            The signed short integer.
     *
     * @return this
     */
    IKeyBuilder append(short v);

    /*
     * Note: this method has been dropped from the API to reduce the
     * possibility of confusion. If you want Unicode semantics then use
     * append(String). If you want ASCII semantics then use appendASCII().
     * If you want signed integer semantics then use append(short).
     */
    // /**
    // * Encodes a character as a 16-bit unsigned integer.
    // *
    // * Note: Characters are encoded as unsigned integers rather than as Unicode
    // * values since the semantics of Unicode collation sequences often violate
    // * the semantics of the character code points, even for ASCII. For example,
    // * the character 'z' has the successor '{', but Unicode collation would
    // * place order the string "{" BEFORE the string "z".
    // *
    // * @param v
    // * The character.
    // *
    // * @return this
    // */
    // public IKeyBuilder append(char v);

    /**
     * Converts the signed byte to an unsigned byte and appends it to the key.
     *
     * @param v
     *            The signed byte.
     *
     * @return this
     */
    IKeyBuilder appendSigned(byte v);

    /**
     * Append an unsigned zero byte to the key.
     *
     * @return this
     */
    IKeyBuilder appendNul();

    /**
     * Encode a {@link BigInteger} into an unsigned byte[] and append it into
     * the key buffer.
     * <p>
     * The encoding is a 2 byte run length whose leading bit is set iff the
     * {@link BigInteger} is negative followed by the {@code byte[]} as
     * returned by {@link BigInteger#toByteArray()}.
     *
     * @param i
     *            The {@link BigInteger} value.
     *
     * @return this
     */
    IKeyBuilder append(BigInteger i);

    /**
     * Encode a {@link BigDecimal} into an unsigned byte[] and append it into
     * the key buffer.
     *
     * @param d
     *            The {@link BigDecimal} value.
     *
     * @return this
     */
    IKeyBuilder append(BigDecimal d);

    /**
     * Append the value to the buffer, encoding it as appropriate based on the
     * class of the object. This method handles all of the primitive data types
     * plus {@link UUID} and Unicode {@link String}s.
     *
     * @param val
     *            The value.
     *
     * @return this
     *
     * @throws IllegalArgumentException
     *             if {@code val} is {@code null}.
     * @throws UnsupportedOperationException
     *             if {@code val} is an instance of an unsupported class.
     */
    IKeyBuilder append(Object val);

    /**
     * Converts the key into a z-order byte array, assuming
     * {@code numDimensions} components of type Long (i.e., 64bit each). For
     * instance, assume the current key's buffer is
     * {@code 001001011010010001010100} and we call the method with
     * {@code numDimensions=3} (the example uses 8-bit components for brevity).
     * The method logically proceeds as follows:
     * <ol>
     * <li>Split the key into n components, namely:
     * {@code 00100101 10100100 01010100}</li>
     * <li>Merge the components bit by bit:
     * {@code 010 001 110 001 000 111 000 100}</li>
     * <li>The result is this merged array</li>
     * </ol>
     *
     * @param numDimensions
     *            The number of components into which the key is split.
     *
     * @return The merged (z-order) byte array.
     */
    byte[] toZOrder(int numDimensions);

    /**
     * Inverts {@link #toZOrder(int)} in the sense that it interprets the
     * buffer as a z-order string and returns an array of long values of size
     * {@code numDimensions}, reflecting the individual components of the
     * z-order string.
     *
     * @param numDimensions
     *            The number of components encoded in the z-order string.
     *
     * @return An array of {@code numDimensions} long values, one per
     *         component of the z-order string.
     */
    long[] fromZOrder(int numDimensions);

}