com.healthmarketscience.jackcess.impl.GeneralLegacyIndexCodes Maven / Gradle / Ivy
/*
Copyright (c) 2008 Health Market Science, Inc.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA
You can contact Health Market Science at [email protected]
or at the following address:
Health Market Science
2700 Horizon Drive
Suite 200
King of Prussia, PA 19406
*/
package com.healthmarketscience.jackcess.impl;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import static com.healthmarketscience.jackcess.impl.ByteUtil.ByteStream;
/**
* Various constants used for creating "general legacy" (access 2000-2007)
* sort order text index entries.
*
* @author James Ahlborn
*/
public class GeneralLegacyIndexCodes {
// maximum number of chars of a text value which are included in an index
// entry (longer values are truncated to this length before encoding)
static final int MAX_TEXT_INDEX_CHAR_LENGTH =
(JetFormat.TEXT_FIELD_MAX_LENGTH / JetFormat.TEXT_FIELD_UNIT_SIZE);
// flag written after the inline codes, also used as a separator between the
// optional "extra" sections of an entry
static final byte END_TEXT = (byte)0x01;
// flag written at the very end of a text index entry
static final byte END_EXTRA_TEXT = (byte)0x00;
// unprintable char is removed from normal text.
// pattern for unprintable chars in the extra bytes:
// 01 01 01 <pos> 06 <code> )
// <pos> = 7 + (4 * char_pos) | 0x8000 (as short)
// <code> = char code
static final int UNPRINTABLE_COUNT_START = 7;
static final int UNPRINTABLE_COUNT_MULTIPLIER = 4;
static final int UNPRINTABLE_OFFSET_FLAGS = 0x8000;
static final byte UNPRINTABLE_MIDFIX = (byte)0x06;
// international char is replaced with ascii char.
// pattern for international chars in the extra bytes:
// [ 02 (for each normal char) ] [ <extra_code> (for each inat char) ]
static final byte INTERNATIONAL_EXTRA_PLACEHOLDER = (byte)0x02;
// see writeCrazyCodes (in this class) for details on writing crazy codes.
// CRAZY_CODE_START is the initial value of every packed crazy code byte,
// CRAZY_CODE_1/CRAZY_CODE_2 are the two possible per-char flag values.
static final byte CRAZY_CODE_START = (byte)0x80;
static final byte CRAZY_CODE_1 = (byte)0x02;
static final byte CRAZY_CODE_2 = (byte)0x03;
// bytes which are always written after the packed crazy codes
static final byte[] CRAZY_CODES_SUFFIX =
new byte[]{(byte)0xFF, (byte)0x02, (byte)0x80, (byte)0xFF, (byte)0x80};
// additional byte written after the crazy codes suffix when unprintable
// codes follow the crazy codes
static final byte CRAZY_CODES_UNPRINT_SUFFIX = (byte)0xFF;
// stash the codes in some resource files (one line per char, see loadCodes)
private static final String CODES_FILE =
DatabaseImpl.RESOURCE_PATH + "index_codes_genleg.txt";
private static final String EXT_CODES_FILE =
DatabaseImpl.RESOURCE_PATH + "index_codes_ext_genleg.txt";
/**
 * Classification of the char encoding strategies used when building text
 * index entries.  Each constant carries the one-char prefix which marks its
 * lines in the index codes resource files and knows how to turn the
 * remainder of such a line into a CharHandler.
 */
enum Type {
  /** chars which are encoded using only inline bytes */
  SIMPLE("S") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return parseSimpleCodes(codeStrings);
    }
  },
  /** chars which are encoded using inline bytes and extra bytes */
  INTERNATIONAL("I") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return parseInternationalCodes(codeStrings);
    }
  },
  /** chars which are encoded using only unprintable bytes */
  UNPRINTABLE("U") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return parseUnprintableCodes(codeStrings);
    }
  },
  /** chars which are encoded using only an extra byte modifier */
  UNPRINTABLE_EXT("P") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return parseUnprintableExtCodes(codeStrings);
    }
  },
  /** chars which are encoded using inline bytes, (optional) extra bytes
      and a crazy flag */
  INTERNATIONAL_EXT("Z") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return parseInternationalExtCodes(codeStrings);
    }
  },
  /** chars which contribute nothing to the index entry */
  IGNORED("X") {
    @Override
    public CharHandler parseCodes(String[] codeStrings) {
      return IGNORED_CHAR_HANDLER;
    }
  };

  /** prefix which identifies this type in the codes files */
  private final String _prefix;

  Type(String prefix) {
    this._prefix = prefix;
  }

  /**
   * Returns the prefix which identifies this type in the codes files.
   */
  public String getPrefixCode() {
    return this._prefix;
  }

  /**
   * Creates the CharHandler described by the given (comma separated) code
   * strings from a codes file line.
   */
  public abstract CharHandler parseCodes(String[] codeStrings);
}
/**
 * Common parent of the handlers which carry the text index encoding
 * information for a single character.  Every accessor other than
 * {@link #getType} has a "nothing to contribute" default ({@code null} or
 * {@code 0}) which the concrete handlers override as needed.
 */
abstract static class CharHandler {

  /**
   * Returns the encoding strategy of this handler.
   */
  public abstract Type getType();

  /**
   * Returns the bytes written directly into the entry for the char, or
   * {@code null} if there are none.
   */
  public byte[] getInlineBytes() {
    return null;
  }

  /**
   * Returns the bytes written into the "extra" section of the entry, or
   * {@code null} if there are none.
   */
  public byte[] getExtraBytes() {
    return null;
  }

  /**
   * Returns the bytes written into the "unprintable" section of the entry,
   * or {@code null} if there are none.
   */
  public byte[] getUnprintableBytes() {
    return null;
  }

  /**
   * Returns the value added to the previously written extra byte, or
   * {@code 0} if there is no modifier.
   */
  public byte getExtraByteModifier() {
    return 0;
  }

  /**
   * Returns the "crazy" flag for the char, or {@code 0} if there is none.
   */
  public byte getCrazyFlag() {
    return 0;
  }
}
/**
 * CharHandler for Type.SIMPLE: the char is encoded using only the given
 * inline bytes.  Instances are immutable holders (the byte array itself is
 * shared, not copied).
 */
private static final class SimpleCharHandler extends CharHandler {
  /** inline code bytes, assigned once at construction */
  private final byte[] _bytes;

  private SimpleCharHandler(byte[] bytes) {
    _bytes = bytes;
  }

  @Override public Type getType() {
    return Type.SIMPLE;
  }

  @Override public byte[] getInlineBytes() {
    return _bytes;
  }
}
/**
 * CharHandler for Type.INTERNATIONAL: the char is encoded using inline bytes
 * plus additional bytes in the "extra" section of the entry.  Instances are
 * immutable holders (the byte arrays themselves are shared, not copied).
 */
private static final class InternationalCharHandler extends CharHandler {
  /** inline code bytes, assigned once at construction */
  private final byte[] _bytes;
  /** extra code bytes, assigned once at construction */
  private final byte[] _extraBytes;

  private InternationalCharHandler(byte[] bytes, byte[] extraBytes) {
    _bytes = bytes;
    _extraBytes = extraBytes;
  }

  @Override public Type getType() {
    return Type.INTERNATIONAL;
  }

  @Override public byte[] getInlineBytes() {
    return _bytes;
  }

  @Override public byte[] getExtraBytes() {
    return _extraBytes;
  }
}
/**
 * CharHandler for Type.UNPRINTABLE: the char has no inline bytes and is
 * encoded only in the "unprintable" section of the entry.  Instances are
 * immutable holders (the byte array itself is shared, not copied).
 */
private static final class UnprintableCharHandler extends CharHandler {
  /** unprintable code bytes, assigned once at construction */
  private final byte[] _unprintBytes;

  private UnprintableCharHandler(byte[] unprintBytes) {
    _unprintBytes = unprintBytes;
  }

  @Override public Type getType() {
    return Type.UNPRINTABLE;
  }

  @Override public byte[] getUnprintableBytes() {
    return _unprintBytes;
  }
}
/**
 * CharHandler for Type.UNPRINTABLE_EXT: the char has no bytes of its own and
 * only contributes a modifier for the "extra" section of the entry.
 */
private static final class UnprintableExtCharHandler extends CharHandler {
  /** extra byte modifier, assigned once at construction */
  private final byte _extraByteMod;

  // takes a primitive byte: the value is stored as a primitive, so accepting
  // a boxed Byte only added a box/unbox round trip and a possible
  // NullPointerException
  private UnprintableExtCharHandler(byte extraByteMod) {
    _extraByteMod = extraByteMod;
  }

  @Override public Type getType() {
    return Type.UNPRINTABLE_EXT;
  }

  @Override public byte getExtraByteModifier() {
    return _extraByteMod;
  }
}
/**
 * CharHandler for Type.INTERNATIONAL_EXT: the char is encoded using inline
 * bytes, extra bytes (may be {@code null}) and a "crazy" flag.  Instances
 * are immutable holders (the byte arrays themselves are shared, not copied).
 */
private static final class InternationalExtCharHandler extends CharHandler {
  /** inline code bytes, assigned once at construction */
  private final byte[] _bytes;
  /** extra code bytes (possibly {@code null}), assigned once at
      construction */
  private final byte[] _extraBytes;
  /** crazy flag, assigned once at construction */
  private final byte _crazyFlag;

  private InternationalExtCharHandler(byte[] bytes, byte[] extraBytes,
                                      byte crazyFlag) {
    _bytes = bytes;
    _extraBytes = extraBytes;
    _crazyFlag = crazyFlag;
  }

  @Override public Type getType() {
    return Type.INTERNATIONAL_EXT;
  }

  @Override public byte[] getInlineBytes() {
    return _bytes;
  }

  @Override public byte[] getExtraBytes() {
    return _extraBytes;
  }

  @Override public byte getCrazyFlag() {
    return _crazyFlag;
  }
}
/**
 * Single shared handler used for every Type.IGNORED char.  It relies
 * entirely on the CharHandler defaults, so it contributes no codes.
 */
static final CharHandler IGNORED_CHAR_HANDLER = new CharHandler() {
  @Override
  public Type getType() {
    return Type.IGNORED;
  }
};

/**
 * Shared handler used for "surrogate" chars, which are not supported.  It
 * reports Type.IGNORED but fails as soon as its inline bytes are requested.
 */
static final CharHandler SURROGATE_CHAR_HANDLER = new CharHandler() {
  @Override
  public Type getType() {
    return Type.IGNORED;
  }

  @Override
  public byte[] getInlineBytes() {
    // surrogates have no codes, any attempt to encode one is an error
    throw new IllegalStateException("Surrogate pair chars are not handled");
  }
};
// range of chars whose handlers are loaded from CODES_FILE (see Codes)
static final char FIRST_CHAR = (char)0x0000;
static final char LAST_CHAR = (char)0x00FF;
// range of chars whose handlers are loaded from EXT_CODES_FILE (see
// ExtCodes), starts immediately after LAST_CHAR
static final char FIRST_EXT_CHAR = LAST_CHAR + 1;
static final char LAST_EXT_CHAR = (char)0xFFFF;
/**
 * Holder for the handlers of the chars FIRST_CHAR through LAST_CHAR.  The
 * array is indexed by char value and is loaded from CODES_FILE when this
 * nested class is initialized.
 */
private static final class Codes
{
/** handlers for the first 256 chars. use nested class to lazy load the
handlers */
private static final CharHandler[] _values = loadCodes(
CODES_FILE, FIRST_CHAR, LAST_CHAR);
}
/**
 * Holder for the handlers of the chars FIRST_EXT_CHAR through LAST_EXT_CHAR.
 * The array is indexed by (char value - FIRST_EXT_CHAR) and is loaded from
 * EXT_CODES_FILE when this nested class is initialized.
 */
private static final class ExtCodes
{
/** handlers for the rest of the chars in BMP 0. use nested class to
lazy load the handlers */
private static final CharHandler[] _values = loadCodes(
EXT_CODES_FILE, FIRST_EXT_CHAR, LAST_EXT_CHAR);
}
/** shared instance of this class (the class itself holds no instance
state) */
static final GeneralLegacyIndexCodes GEN_LEG_INSTANCE =
new GeneralLegacyIndexCodes();
/** package-private constructor, no initialization is necessary */
GeneralLegacyIndexCodes() {
}
/**
 * Looks up the CharHandler which describes how the given character is
 * encoded.  Chars above LAST_CHAR are resolved against the (lazily loaded)
 * extended table, all others against the basic table.
 */
CharHandler getCharHandler(char c)
{
  if(c > LAST_CHAR) {
    // the extended table starts at FIRST_EXT_CHAR, so rebase the index
    int extIdx = asUnsignedChar(c) - asUnsignedChar(FIRST_EXT_CHAR);
    return ExtCodes._values[extIdx];
  }
  return Codes._values[c];
}
/**
 * Loads the CharHandlers for the given range of characters from the
 * resource file with the given name.  The file is read as US-ASCII and must
 * contain one line for each non-surrogate char in the range (in order);
 * surrogate chars consume no line and are mapped to SURROGATE_CHAR_HANDLER.
 *
 * @param codesFilePath path of the codes resource to load
 * @param firstChar first char (inclusive) described by the resource
 * @param lastChar last char (inclusive) described by the resource
 * @return the handlers for the range, indexed by (char - firstChar)
 * @throws RuntimeException if reading the resource fails
 * @throws IllegalStateException if the resource has fewer lines than
 *         expected
 */
static CharHandler[] loadCodes(String codesFilePath,
                               char firstChar, char lastChar)
{
  int start = asUnsignedChar(firstChar);
  int end = asUnsignedChar(lastChar);
  int numCodes = (end - start) + 1;
  CharHandler[] values = new CharHandler[numCodes];

  // lookup of line prefix -> handler type
  Map<String,Type> prefixMap = new HashMap<String,Type>();
  for(Type type : Type.values()) {
    prefixMap.put(type.getPrefixCode(), type);
  }

  BufferedReader reader = null;
  try {

    reader = new BufferedReader(
        new InputStreamReader(
            DatabaseImpl.getResourceAsStream(codesFilePath), "US-ASCII"));

    for(int i = start; i <= end; ++i) {
      char c = (char)i;
      CharHandler ch = null;
      if(Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
        // surrogate chars are not included in the codes files
        ch = SURROGATE_CHAR_HANDLER;
      } else {
        String codeLine = reader.readLine();
        if(codeLine == null) {
          // readLine returns null at end of stream, fail with a useful
          // message instead of a NullPointerException further down
          throw new IllegalStateException(
              "index codes file " + codesFilePath +
              " ended before char 0x" + Integer.toHexString(i));
        }
        ch = parseCodes(prefixMap, codeLine);
      }
      values[(i - start)] = ch;
    }

  } catch(IOException e) {
    throw new RuntimeException("failed loading index codes file " +
                               codesFilePath, e);
  } finally {
    ByteUtil.closeQuietly(reader);
  }

  return values;
}
/**
 * Returns a CharHandler parsed from the given line from an index codes
 * file.  The first char of the line is the type prefix, the remainder is a
 * comma separated list of code strings which is handed to the type for
 * parsing (trailing empty strings are preserved).
 *
 * @param prefixMap lookup of type prefix to Type
 * @param codeLine line from the codes file
 * @return the handler described by the line
 * @throws IllegalStateException if the line is missing/empty or starts with
 *         an unknown prefix
 */
private static CharHandler parseCodes(Map<String,Type> prefixMap,
                                      String codeLine)
{
  if((codeLine == null) || (codeLine.length() == 0)) {
    throw new IllegalStateException("Missing index code line");
  }

  String prefix = codeLine.substring(0, 1);
  Type type = prefixMap.get(prefix);
  if(type == null) {
    throw new IllegalStateException(
        "Unknown index code prefix '" + prefix + "' in line '" + codeLine +
        "'");
  }

  // substring(1) yields "" when the line consists of only the prefix
  String suffix = codeLine.substring(1);
  return type.parseCodes(suffix.split(",", -1));
}
/**
 * Builds the handler for a Type.SIMPLE char.  Exactly one (non-empty) code
 * string is expected, which holds the inline bytes.
 */
private static CharHandler parseSimpleCodes(String[] codeStrings)
{
  if(codeStrings.length == 1) {
    byte[] inlineBytes = codesToBytes(codeStrings[0], true);
    return new SimpleCharHandler(inlineBytes);
  }
  throw new IllegalStateException("Unexpected code strings " +
                                  Arrays.asList(codeStrings));
}
/**
 * Builds the handler for a Type.INTERNATIONAL char.  Exactly two (non-empty)
 * code strings are expected: the inline bytes followed by the extra bytes.
 */
private static CharHandler parseInternationalCodes(String[] codeStrings)
{
  if(codeStrings.length == 2) {
    byte[] inlineBytes = codesToBytes(codeStrings[0], true);
    byte[] extraBytes = codesToBytes(codeStrings[1], true);
    return new InternationalCharHandler(inlineBytes, extraBytes);
  }
  throw new IllegalStateException("Unexpected code strings " +
                                  Arrays.asList(codeStrings));
}
/**
 * Builds the handler for a Type.UNPRINTABLE char.  Exactly one (non-empty)
 * code string is expected, which holds the unprintable bytes.
 */
private static CharHandler parseUnprintableCodes(String[] codeStrings)
{
  if(codeStrings.length == 1) {
    byte[] unprintBytes = codesToBytes(codeStrings[0], true);
    return new UnprintableCharHandler(unprintBytes);
  }
  throw new IllegalStateException("Unexpected code strings " +
                                  Arrays.asList(codeStrings));
}
/**
 * Builds the handler for a Type.UNPRINTABLE_EXT char.  Exactly one code
 * string is expected and it must decode to exactly one byte, the extra byte
 * modifier.
 */
private static CharHandler parseUnprintableExtCodes(String[] codeStrings)
{
  // only decode when the number of code strings is correct (required codes
  // never decode to null, so null here always means "wrong count")
  byte[] modBytes = ((codeStrings.length == 1) ?
                     codesToBytes(codeStrings[0], true) : null);
  if((modBytes == null) || (modBytes.length != 1)) {
    throw new IllegalStateException("Unexpected code strings " +
                                    Arrays.asList(codeStrings));
  }
  return new UnprintableExtCharHandler(modBytes[0]);
}
/**
 * Builds the handler for a Type.INTERNATIONAL_EXT char.  Exactly three code
 * strings are expected: the (required) inline bytes, the (optional) extra
 * bytes and the crazy flag selector ("1" selects CRAZY_CODE_1, anything else
 * selects CRAZY_CODE_2).
 */
private static CharHandler parseInternationalExtCodes(String[] codeStrings)
{
  if(codeStrings.length != 3) {
    throw new IllegalStateException("Unexpected code strings " +
                                    Arrays.asList(codeStrings));
  }

  byte[] inlineBytes = codesToBytes(codeStrings[0], true);
  // the extra bytes may be absent, in which case this is null
  byte[] extraBytes = codesToBytes(codeStrings[1], false);

  byte crazyFlag;
  if("1".equals(codeStrings[2])) {
    crazyFlag = CRAZY_CODE_1;
  } else {
    crazyFlag = CRAZY_CODE_2;
  }

  return new InternationalExtCharHandler(inlineBytes, extraBytes, crazyFlag);
}
/**
 * Decodes a string of hex digits into the bytes it represents.  A string
 * with an odd number of digits is treated as if a leading "0" had been
 * dropped.  An empty string yields {@code null}, unless codes are required
 * in which case an IllegalStateException is thrown.
 */
private static byte[] codesToBytes(String codes, boolean required)
{
  if(codes.length() == 0) {
    if(!required) {
      return null;
    }
    throw new IllegalStateException("empty code bytes");
  }

  // an odd number of digits means a leading 0 was stripped, restore it
  String hex = ((((codes.length() & 1) == 1)) ? ("0" + codes) : codes);

  int numBytes = hex.length() >> 1;
  byte[] result = new byte[numBytes];
  for(int pos = 0, idx = 0; idx < numBytes; pos += 2, ++idx) {
    result[idx] = (byte)Integer.parseInt(hex.substring(pos, pos + 2), 16);
  }
  return result;
}
/**
 * Returns the given char value as an unsigned int value (0 through 65535).
 * Java chars are already unsigned, so the mask is purely defensive.
 */
static int asUnsignedChar(char c)
{
  final int widened = c;
  return (widened & 0xFFFF);
}
/**
 * Converts an index value for a text column into the entry value (which
 * is based on a variety of nifty codes).
 * <p>
 * The bytes are appended to the given stream in this order: the inline codes
 * of every char, END_TEXT, the (trimmed) extra codes if any, then (only if
 * there are crazy and/or unprintable codes) two more END_TEXT flags, the
 * crazy codes, and the unprintable codes (preceded by another END_TEXT), and
 * finally END_EXTRA_TEXT.  For descending order, everything written for this
 * value (plus an additional END_EXTRA_TEXT) is flipped before the final
 * END_EXTRA_TEXT is written.
 *
 * @param value the (non-null) value to encode, converted to a string using
 *              ColumnImpl.toCharSequence
 * @param bout stream to which the encoded entry is appended
 * @param isAscending whether the index column is sorted ascending
 * @throws IOException declared by the underlying stream writes
 * @throws IllegalStateException if the value contains a char which is
 *         handled by SURROGATE_CHAR_HANDLER
 */
void writeNonNullIndexTextValue(
Object value, ByteStream bout, boolean isAscending)
throws IOException
{
// first, convert to string
String str = ColumnImpl.toCharSequence(value).toString();
// all text columns (including memos) are only indexed up to the max
// number of chars in a VARCHAR column
if(str.length() > MAX_TEXT_INDEX_CHAR_LENGTH) {
str = str.substring(0, MAX_TEXT_INDEX_CHAR_LENGTH);
}
// record previous entry length so we can do any post-processing
// necessary for this entry (handling descending)
int prevLength = bout.getLength();
// now, convert each character to a "code" of one or more bytes
// (the secondary streams are created lazily, only when a char needs them)
ExtraCodesStream extraCodes = null;
ByteStream unprintableCodes = null;
ByteStream crazyCodes = null;
// number of chars seen so far which had inline codes
int charOffset = 0;
for(int i = 0; i < str.length(); ++i) {
char c = str.charAt(i);
CharHandler ch = getCharHandler(c);
// offset of the current char, captured before it is (possibly) counted
int curCharOffset = charOffset;
byte[] bytes = ch.getInlineBytes();
if(bytes != null) {
// write the "inline" codes immediately
bout.write(bytes);
// only increment the charOffset for chars with inline codes
++charOffset;
}
if(ch.getType() == Type.SIMPLE) {
// common case, skip further code handling
continue;
}
bytes = ch.getExtraBytes();
byte extraCodeModifier = ch.getExtraByteModifier();
if((bytes != null) || (extraCodeModifier != 0)) {
if(extraCodes == null) {
extraCodes = new ExtraCodesStream(str.length());
}
// keep track of the extra codes for later
writeExtraCodes(curCharOffset, bytes, extraCodeModifier, extraCodes);
}
bytes = ch.getUnprintableBytes();
if(bytes != null) {
if(unprintableCodes == null) {
unprintableCodes = new ByteStream();
}
// keep track of the unprintable codes for later
// (extraCodes may still be null here, which is handled by the callee)
writeUnprintableCodes(curCharOffset, bytes, unprintableCodes,
extraCodes);
}
byte crazyFlag = ch.getCrazyFlag();
if(crazyFlag != 0) {
if(crazyCodes == null) {
crazyCodes = new ByteStream();
}
// keep track of the crazy flags for later
crazyCodes.write(crazyFlag);
}
}
// write end text flag
bout.write(END_TEXT);
// trailing extra code bytes in the range 0 through the placeholder value
// are dropped, extra codes only count if something is left afterwards
boolean hasExtraCodes = trimExtraCodes(
extraCodes, (byte)0, INTERNATIONAL_EXTRA_PLACEHOLDER);
boolean hasUnprintableCodes = (unprintableCodes != null);
boolean hasCrazyCodes = (crazyCodes != null);
if(hasExtraCodes || hasUnprintableCodes || hasCrazyCodes) {
// we write all the international extra bytes first
if(hasExtraCodes) {
extraCodes.writeTo(bout);
}
if(hasCrazyCodes || hasUnprintableCodes) {
// write 2 more end flags
bout.write(END_TEXT);
bout.write(END_TEXT);
// next come the crazy flags
if(hasCrazyCodes) {
writeCrazyCodes(crazyCodes, bout);
// if we are writing unprintable codes after this, tack on another
// code
if(hasUnprintableCodes) {
bout.write(CRAZY_CODES_UNPRINT_SUFFIX);
}
}
// then we write all the unprintable extra bytes
if(hasUnprintableCodes) {
// write another end flag
bout.write(END_TEXT);
unprintableCodes.writeTo(bout);
}
}
}
// handle descending order by inverting the bytes
if(!isAscending) {
// we actually write the end byte before flipping the bytes, and write
// another one after flipping
bout.write(END_EXTRA_TEXT);
// flip the bytes that we have written thus far for this text value
// (only the range starting at prevLength, earlier bytes are untouched)
IndexData.flipBytes(bout.getBytes(), prevLength,
(bout.getLength() - prevLength));
}
// write end extra text
bout.write(END_EXTRA_TEXT);
}
/**
 * Records the extra code info of a single char in the given stream.  Any
 * chars before the given offset which have not written extra codes are
 * first filled in with placeholder bytes.  The char then either appends its
 * own extra bytes, or (when it only has a modifier) adjusts the most
 * recently written extra byte.
 */
private static void writeExtraCodes(
    int charOffset, byte[] bytes, byte extraCodeModifier,
    ExtraCodesStream extraCodes)
  throws IOException
{
  // pad with a placeholder for every preceding char w/out extra codes
  int missingChars = charOffset - extraCodes.getNumChars();
  if(missingChars > 0) {
    extraCodes.writeFill(missingChars, INTERNATIONAL_EXTRA_PLACEHOLDER);
    extraCodes.incrementNumChars(missingChars);
  }

  if(bytes != null) {
    // real extra codes, these count as one additional char
    extraCodes.write(bytes);
    extraCodes.incrementNumChars(1);
    return;
  }

  // only a modifier: it alters existing extra code bytes and does not count
  // as an additional extra code char
  int tailIdx = extraCodes.getLength() - 1;
  if(tailIdx < 0) {
    // nothing to modify yet, so the modifier becomes a new code (which is
    // tracked as an "unprintable code" prefix)
    extraCodes.write(extraCodeModifier);
    extraCodes.setUnprintablePrefixLen(1);
    return;
  }

  // fold the modifier into the last extra code written
  byte modified = (byte)(extraCodes.get(tailIdx) + extraCodeModifier);
  extraCodes.set(tailIdx, modified);
}
/**
 * Strips all trailing bytes which fall within the given range from the
 * given stream and reports whether the stream still holds any bytes
 * afterwards.  A {@code null} stream is treated as empty.
 */
private static boolean trimExtraCodes(ByteStream extraCodes,
                                      byte minTrimCode, byte maxTrimCode)
  throws IOException
{
  boolean hasRemaining = false;
  if(extraCodes != null) {
    extraCodes.trimTrailing(minTrimCode, maxTrimCode);
    // did the trim leave anything behind?
    hasRemaining = (extraCodes.getLength() > 0);
  }
  return hasRemaining;
}
/**
 * Appends the encoding of one unprintable char to the given unprintable
 * codes stream: a 2 byte (big-endian) flagged offset, the midfix byte and
 * the char's own unprintable bytes.
 */
private static void writeUnprintableCodes(
    int charOffset, byte[] bytes, ByteStream unprintableCodes,
    ExtraCodesStream extraCodes)
  throws IOException
{
  // the offset appears to be based on the number of bytes in the "extra
  // codes" part of the entry (even if no extra code bytes end up in the
  // final entry).  when extra codes exist, account for the ones not yet
  // written and ignore any unprintable bytes at the start of the extra codes
  int effectiveOffset = ((extraCodes == null) ? charOffset :
                         (extraCodes.getLength() +
                          (charOffset - extraCodes.getNumChars()) -
                          extraCodes.getUnprintablePrefixLen()));

  // combine the scaled offset with the marker flags
  int encodedOffset = (UNPRINTABLE_COUNT_START +
                       (UNPRINTABLE_COUNT_MULTIPLIER * effectiveOffset))
    | UNPRINTABLE_OFFSET_FLAGS;

  // high byte first, then low byte
  unprintableCodes.write((encodedOffset >> 8) & 0xFF);
  unprintableCodes.write(encodedOffset & 0xFF);

  unprintableCodes.write(UNPRINTABLE_MIDFIX);
  unprintableCodes.write(bytes);
}
/**
 * Packs the given crazy code flags into the given output stream, followed
 * by the crazy codes suffix.  Trailing CRAZY_CODE_2 flags are dropped
 * before packing.
 */
private static void writeCrazyCodes(ByteStream crazyCodes, ByteStream bout)
  throws IOException
{
  // CRAZY_CODE_2 flags at the end carry no information, remove them
  trimExtraCodes(crazyCodes, CRAZY_CODE_2, CRAZY_CODE_2);

  int numCodes = crazyCodes.getLength();
  if(numCodes > 0) {

    // each output byte holds a 2 bit common prefix followed by up to three
    // 2 bit codes (first code in the highest position)
    byte packed = CRAZY_CODE_START;
    for(int i = 0; i < numCodes; ++i) {
      int slot = i % 3;
      packed |= (byte)(crazyCodes.get(i) << ((2 - slot) * 2));
      if(slot == 2) {
        // byte is full, emit it and start over
        bout.write(packed);
        packed = CRAZY_CODE_START;
      }
    }

    // emit the final, partially filled byte
    if((numCodes % 3) != 0) {
      bout.write(packed);
    }
  }

  // the suffix is written regardless (even if every code was trimmed)
  bout.write(CRAZY_CODES_SUFFIX);
}
/**
 * ByteStream specialization used while collecting extra codes.  In addition
 * to the bytes themselves it tracks how many chars have been accounted for
 * and how long the leading "unprintable" code prefix is.
 */
private static final class ExtraCodesStream extends ByteStream
{
  /** number of chars accounted for in this stream so far */
  private int _charCount;
  /** length of the "unprintable" code prefix at the start of the stream */
  private int _unprintPrefixLen;

  private ExtraCodesStream(int length) {
    super(length);
  }

  public int getNumChars() {
    return this._charCount;
  }

  public void incrementNumChars(int inc) {
    this._charCount = this._charCount + inc;
  }

  public int getUnprintablePrefixLen() {
    return this._unprintPrefixLen;
  }

  public void setUnprintablePrefixLen(int len) {
    this._unprintPrefixLen = len;
  }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy