// com.bigdata.rdf.internal.IVUnicode
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Oct 5, 2011
*/
package com.bigdata.rdf.internal;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Comparator;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.io.compression.IUnicodeCompressor;
import com.bigdata.io.compression.NoCompressor;
import com.bigdata.util.Bytes;
/**
* Utility class supporting {@link IV}s having inline Unicode data.
*
* IVs must be able to report their correct mutual order. This means that the
* Java {@link String} must be given the same order as the encoded Unicode
* representation. Since we must include the #of bytes in the {@link IV}
* representation, this means that we wind up with a length prefix followed by
* some representation of the character data. This can not be consistent with
* the code point ordering imposed by {@link String#compareTo(String)}.
* Therefore, the {@link IVUnicodeComparator} is used to make the ordering over
* the {@link String} data consistent with the encoded representation of that
* data.
*
* Note: This is not the only way to solve the problem. We could also have
* generated the encoded representation from any {@link IV} having inline
* Unicode data each time we need to compare two {@link IV}s, but that could
* turn into a lot of overhead.
*
* Note: This does not attempt to make the Unicode representation "tight" and is
* not intended to handle very large Unicode strings. Large Unicode data in the
* statement indices causes them to bloat and has a negative impact on the
* overall system performance. The use case for inline Unicode data is when the
* data are small enough that they are worth inserting into the statement
* indices rather than indirecting through the TERM2ID/ID2TERM indices. Large
* RDF Values should always be inserted into the BLOBS index which is designed
* for that purpose.
*
* @author Bryan Thompson
* @version $Id$
*
* TODO This is directly persisting char[] data. Is that portable?
*/
public class IVUnicode {

    /**
     * Helper instance for compression/decompression of Unicode string data.
     * <p>
     * NOTE(review): the inlined encode/decode paths below no longer route
     * through this compressor (they write raw UTF-16 code units). The field
     * is retained so the disabled compressed encodings can be re-enabled
     * easily.
     */
//    private static final IUnicodeCompressor uc = new BOCU1Compressor();
//    private static final IUnicodeCompressor uc = new SCSUCompressor();
    private static final IUnicodeCompressor uc = new NoCompressor();

    /**
     * Encode a Unicode string as a <code>short</code> character-length prefix
     * followed by one <code>short</code> per UTF-16 code unit, all written
     * through {@link KeyBuilder#append(short)} so that the resulting
     * <code>byte[]</code> preserves the (length, code unit) order.
     * <p>
     * Note: This places a 32k character length limit on inline Unicode.
     * That's probably quite Ok as large Unicode values have an extreme
     * negative impact on the statement indices and really belong in the
     * BLOBS index.
     *
     * @param s
     *            The string.
     *
     * @return The encoded byte[] (length prefix plus two bytes per code
     *         unit).
     *
     * @throws UnsupportedOperationException
     *             if the string is longer than {@link Short#MAX_VALUE}
     *             characters.
     */
    static public byte[] encode1(final String s) {

        final int slen = s.length();

        if (slen > Short.MAX_VALUE)
            throw new UnsupportedOperationException();

        // 2 byte length prefix plus 2 bytes per UTF-16 code unit.
        final int capacity = Bytes.SIZEOF_SHORT + (slen << 1);

        final IKeyBuilder k = new KeyBuilder(capacity);

        // The #of characters in the string (length prefix).
        k.append((short) slen);

        /*
         * Each UTF-16 code unit, cast to a signed short. decodeShort() below
         * reverses a sign-bit toggle when reading these values back, so
         * KeyBuilder presumably encodes shorts with the sign bit flipped to
         * preserve the unsigned (code point) order -- TODO confirm against
         * KeyBuilder.append(short).
         */
        for (int i = 0; i < slen; i++) {

            final char ch = s.charAt(i);

            k.append((short) ch);

        }

        assert k.array().length == capacity;

        return k.array();

    }

    /**
     * Decode a {@link String} from the input stream. The result is appended
     * into the caller's buffer. The caller is responsible for resetting the
     * buffer as necessary.
     *
     * @param in
     *            The input stream.
     * @param out
     *            The output buffer.
     *
     * @return The #of bytes consumed from the input stream.
     *
     * @throws IOException
     */
    static int decode(final InputStream in, final StringBuilder out)
            throws IOException {

        /*
         * FIXME If this works, then change the API to pass in a byte[]. That
         * API will be better aligned for the IV decode use case since we
         * always have the byte[].
         */
        final DataInputStream dis = new DataInputStream(in);

        // The character length prefix (see encode1()).
        final int slen = decodeShort(dis.readShort());

        assert slen <= Short.MAX_VALUE : slen;
        assert slen >= 0 : slen;

        out.ensureCapacity(slen);

        // One UTF-16 code unit per encoded short.
        for (int i = 0; i < slen; i++) {

            final short unsignedShort = decodeShort(dis.readShort());

            out.append((char) unsignedShort);

        }

        // #of bytes consumed: length prefix plus two bytes per code unit.
        return Bytes.SIZEOF_SHORT + (slen << 1);

    }

    /**
     * Recover the original unsigned value from a signed <code>short</code>
     * read back out of a key by toggling the sign bit (per
     * XSDUnsignedShortIV). This reverses the order-preserving encoding
     * applied when the value was appended to the key in {@link #encode1}.
     *
     * @param signedShort
     *            The signed value as read by
     *            {@link DataInputStream#readShort()}.
     *
     * @return The decoded value.
     */
    private static short decodeShort(final short signedShort) {

        int v = signedShort;

        if (v < 0) {

            v = v + 0x8000;

        } else {

            v = v - 0x8000;

        }

        return (short) v;

    }

    /**
     * Return the byte length of the serialized representation of a unicode
     * string.
     * <p>
     * Note: This MUST agree with the encoding laid down by {@link #encode1}
     * and consumed by {@link #decode}: a 2 byte length prefix plus two bytes
     * per UTF-16 code unit. (The previous formula,
     * <code>Bytes.SIZEOF_INT + s.length()</code>, did not match the actual
     * encoding.)
     *
     * @param s
     *            The string.
     *
     * @return Its byte length.
     */
    public static int byteLengthUnicode(final String s) {

        return Bytes.SIZEOF_SHORT + (s.length() << 1);

    }

    /**
     * Class imposes the natural ordering of the encoded Unicode
     * representation for an {@link IV} having inline Unicode data on Java
     * {@link String}s: shorter strings order first (matching the length
     * prefix written by {@link #encode1}), and strings of equal length fall
     * back to {@link String#compareTo(String)}. This is used by such
     * {@link IV}s in order to impose on themselves the correct natural
     * order.
     *
     * @author Bryan Thompson
     */
    public static class IVUnicodeComparator implements Comparator<String> {

        /** Singleton (the comparator is stateless and thread-safe). */
        public static final IVUnicodeComparator INSTANCE = new IVUnicodeComparator();

        private IVUnicodeComparator() {
        }

        @Override
        public int compare(final String o1, final String o2) {

            // Order by length first (matches the encoded length prefix).
            final int ret = Integer.compare(o1.length(), o2.length());

            if (ret != 0)
                return ret;

            // Only compare strings which have the same length.
            return o1.compareTo(o2);

        }

    }

}