com.healthmarketscience.jackcess.impl.scsu.Compress Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jackcess Show documentation
Show all versions of jackcess Show documentation
A pure Java library for reading from and writing to MS Access databases.
package com.healthmarketscience.jackcess.impl.scsu;
/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
* @version 004 Aug 25 1997
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/
/**
This class implements a simple compression algorithm
**/
/*
Note on exception handling
This compressor is designed so that it can be restarted after
an exception. All operations advancing input and/or output cursor
(iIn and iOut) either complete an action, or set a state (fUnicodeMode)
before updating the cursors.
*/
public class Compress extends SCSU
{
/** next input character to be read **/
private int iIn;
/** next output byte to be written **/
private int iOut;
/** start index of Unicode mode in output array, or -1 if in single byte mode **/
private int iSCU = -1;
/** true if the next command byte is of the Uxx family */
private boolean fUnicodeMode = false;
/** locate a window for a character given a table of offsets
@param ch - character
@param offsetTable - table of window offsets
@return true if the character fits a window from the table of windows */
private boolean locateWindow(int ch, int[] offsetTable)
{
// always try the current window first
int iWin = getCurrentWindow();
// if the character fits the current window
// just use the current window
if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
{
return true;
}
// try all windows in order
for (iWin = 0; iWin < offsetTable.length; iWin++)
{
if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
{
selectWindow(iWin);
return true;
}
}
// none found
return false;
}
/** returns true if the character is ASCII, but not a control other than CR, LF and TAB */
public static boolean isAsciiCrLfOrTab(int ch)
{
return (ch >= 0x20 && ch <= 0x7F) // ASCII
|| ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB
}
/** output a run of characters in single byte mode
In single byte mode pass through characters in the ASCII range, but
quote characters overlapping with compression command codes. Runs
of characters fitting the current window are output as runs of bytes
in the range 0x80-0xFF. Checks for and validates Surrogate Pairs.
Uses and updates the current input and output cursors store in
the instance variables iIn and iOut.
@param in - input character array
@param out - output byte array
@return the next chaacter to be processed. This may be an extended character.
**/
@SuppressWarnings("fallthrough")
public int outputSingleByteRun(char [] in, byte [] out)
throws EndOfOutputException, EndOfInputException, IllegalInputException
{
int iWin = getCurrentWindow();
while(iIn < in.length)
{
int outlen = 0;
byte byte1 = 0;
byte byte2 = 0;
// get the input character
int ch = in[iIn];
int inlen = 1;
// Check input for Surrogate pair
if ( (ch & 0xF800) == 0xD800 )
{
if ( (ch & 0xFC00) == 0xDC00 )
{
// low surrogate out of order
throw new IllegalInputException("Unpaired low surrogate: "+iIn);
}
else
{
// have high surrogate now get low surrogate
if ( iIn >= in.length-1)
{
// premature end of input
throw new EndOfInputException();
}
// get the char
int ch2 = in[iIn+1];
// make sure it's a low surrogate
if ( (ch2 & 0xFC00) != 0xDC00 )
{
// a low surrogate was required
throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1));
}
// combine the two values
ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
// ch = ch<<10 + ch2 - 0x36F0000;
inlen = 2;
}
}
// ASCII Letter, NUL, CR, LF and TAB are always passed through
if (isAsciiCrLfOrTab(ch) || ch == 0)
{
// pass through directcly
byte2 = (byte)(ch & 0x7F);
outlen = 1;
}
// All other control codes must be quoted
else if (ch < 0x20)
{
byte1 = SQ0;
byte2 = (byte)(ch);
outlen = 2;
}
// Letters that fit the current dynamic window
else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
{
ch -= dynamicOffset[iWin];
byte2 = (byte)(ch | 0x80);
outlen = 1;
}
// check for room in the output array
if (iOut + outlen >= out.length)
{
throw new EndOfOutputException();
}
switch(outlen)
{
default:
// need to use some other compression mode for this
// character so we terminate this loop
return ch; // input not finished
// output the characters
case 2:
out[iOut++] = byte1;
// fall through
case 1:
out[iOut++] = byte2;
break;
}
// advance input pointer
iIn += inlen;
}
return 0; // input all used up
}
/** quote a single character in single byte mode
Quoting a character (aka 'non-locking shift') gives efficient access
to characters that occur in isolation--usually punctuation characters.
When quoting a character from a dynamic window use 0x80 - 0xFF, when
quoting a character from a static window use 0x00-0x7f.
@param ch - character to be quoted
@param out - output byte array
**/
private void quoteSingleByte(int ch, byte [] out)
throws EndOfOutputException
{
Debug.out("Quoting SingleByte ", ch);
int iWin = getCurrentWindow();
// check for room in the output array
if (iOut >= out.length -2)
{
throw new EndOfOutputException();
}
// Output command byte followed by
out[iOut++] = (byte)(SQ0 + iWin);
// Letter that fits the current dynamic window
if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
{
ch -= dynamicOffset[iWin];
out[iOut++] = (byte)(ch | 0x80);
}
// Letter that fits the current static window
else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80)
{
ch -= staticOffset[iWin];
out[iOut++] = (byte)ch;
}
else
{
throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error");
}
// advance input pointer
iIn ++;
Debug.out("New input: ", iIn);
}
/** output a run of characters in Unicode mode
A run of Unicode mode consists of characters which are all in the
range of non-compressible characters or isolated occurrence
of any other characters. Characters in the range 0xE00-0xF2FF must
be quoted to avoid overlap with the Unicode mode compression command codes.
Uses and updates the current input and output cursors store in
the instance variables iIn and iOut.
NOTE: Characters from surrogate pairs are passed through and unlike single
byte mode no checks are made for unpaired surrogate characters.
@param in - input character array
@param out - output byte array
@return the next input character to be processed
**/
public char outputUnicodeRun(char [] in, byte [] out)
throws EndOfOutputException
{
// current character
char ch = 0;
while(iIn < in.length)
{
// get current input and set default output length
ch = in[iIn];
int outlen = 2;
// Characters in these ranges could potentially be compressed.
// We require 2 or more compressible characters to break the run
if (isCompressible(ch))
{
// check whether we can look ahead
if( iIn < in.length - 1)
{
// DEBUG
Debug.out("is-comp: ",ch);
char ch2 = in[iIn + 1];
if (isCompressible(ch2))
{
// at least 2 characters are compressible
// break the run
break;
}
//DEBUG
Debug.out("no-comp: ",ch2);
}
// If we get here, the current character is only character
// left in the input or it is followed by a non-compressible
// character. In neither case do we gain by breaking the
// run, so we proceed to output the character.
if (ch >= 0xE000 && ch <= 0xF2FF)
{
// Characters in this range need to be escaped
outlen = 3;
}
}
// check that there is enough room to output the character
if(iOut >= out.length - outlen)
{
// DEBUG
Debug.out("End of Output @", iOut);
// if we got here, we ran out of space in the output array
throw new EndOfOutputException();
}
// output any characters that cannot be compressed,
if (outlen == 3)
{
// output the quote character
out[iOut++] = UQU;
}
// pass the Unicode character in MSB,LSB order
out[iOut++] = (byte)(ch >>> 8);
out[iOut++] = (byte)(ch & 0xFF);
// advance input cursor
iIn++;
}
// return the last character
return ch;
}
static int iNextWindow = 3;
/** redefine a window so it surrounds a given character value
For now, this function uses window 3 exclusively (window 4
for extended windows);
@return true if a window was successfully defined
@param ch - character around which window is positioned
@param out - output byte array
@param fCurUnicodeMode - type of window
**/
private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode)
throws IllegalInputException, EndOfOutputException
{
int iWin = iNextWindow % 8; // simple LRU
int iPosition = 0;
// iPosition 0 is a reserved value
if (ch < 0x80)
{
throw new IllegalStateException("ch < 0x80");
//return false;
}
// Check the fixed offsets
for (int i = 0; i < fixedOffset.length; i++)
{
if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80)
{
iPosition = i;
break;
}
}
if (iPosition != 0)
{
// DEBUG
Debug.out("FIXED position is ", iPosition + 0xF9);
// ch fits in a fixed offset window position
dynamicOffset[iWin] = fixedOffset[iPosition];
iPosition += 0xF9;
}
else if (ch < 0x3400)
{
// calculate a window position command and set the offset
iPosition = ch >>> 7;
dynamicOffset[iWin] = ch & 0xFF80;
Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
}
else if (ch < 0xE000)
{
// attempt to place a window where none can go
return false;
}
else if (ch <= 0xFFFF)
{
// calculate a window position command, accounting
// for the gap in position values, and set the offset
iPosition = ((ch - gapOffset)>>> 7);
dynamicOffset[iWin] = ch & 0xFF80;
Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
}
else
{
// if we get here, the character is in the extended range.
// Always use Window 4 to define an extended window
iPosition = (ch - 0x10000) >>> 7;
// DEBUG
Debug.out("Try position Window at ", iPosition);
iPosition |= iWin << 13;
dynamicOffset[iWin] = ch & 0x1FFF80;
}
// Outputting window defintion command for the general cases
if ( iPosition < 0x100 && iOut < out.length-1)
{
out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin);
out[iOut++] = (byte) (iPosition & 0xFF);
}
// Output an extended window definiton command
else if ( iPosition >= 0x100 && iOut < out.length - 2)
{
Debug.out("Setting extended window at ", iPosition);
out[iOut++] = (fCurUnicodeMode ? UDX : SDX);
out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
out[iOut++] = (byte) (iPosition & 0xFF);
}
else
{
throw new EndOfOutputException();
}
selectWindow(iWin);
iNextWindow++;
return true;
}
/**
compress a Unicode character array with some simplifying assumptions
**/
public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut)
throws IllegalInputException, EndOfInputException, EndOfOutputException
{
iIn = iStartIn;
iOut = iStartOut;
while (iIn < in.length)
{
int ch;
// previously we switched to a Unicode run
if (iSCU != -1)
{
Debug.out("Remaining", in, iIn);
Debug.out("Output until ["+iOut+"]: ", out);
// output characters as Unicode
ch = outputUnicodeRun(in, out);
// for single character Unicode runs (3 bytes) use quote
if (iOut - iSCU == 3 )
{
// go back and fix up the SCU to an SQU instead
out[iSCU] = SQU;
iSCU = -1;
continue;
}
else
{
iSCU = -1;
fUnicodeMode = true;
}
}
// next, try to output characters as single byte run
else
{
ch = outputSingleByteRun(in, out);
}
// check whether we still have input
if (iIn == in.length)
{
break; // no more input
}
// if we get here, we have a consistent value for ch, whether or
// not it is an regular or extended character. Locate or define a
// Window for the current character
Debug.out("Output so far: ", out);
Debug.out("Routing ch="+ch+" for Input", in, iIn);
// Check that we have enough room to output the command byte
if (iOut >= out.length - 1)
{
throw new EndOfOutputException();
}
// In order to switch away from Unicode mode, it is necessary
// to select (or define) a window. If the characters that follow
// the Unicode range are ASCII characters, we can't use them
// to decide which window to select, since ASCII characters don't
// influence window settings. This loop looks ahead until it finds
// one compressible character that isn't in the ASCII range.
for (int ich = iIn; ch < 0x80; ich++)
{
if (ich == in.length || !isCompressible(in[ich]))
{
// if there are only ASCII characters left,
ch = in[iIn];
break;
}
ch = in[ich]; // lookahead for next non-ASCII char
}
// The character value contained in ch here will only be used to select
// output modes. Actual output of characters starts with in[iIn] and
// only takes place near the top of the loop.
int iprevWindow = getCurrentWindow();
// try to locate a dynamic window
if (ch < 0x80 || locateWindow(ch, dynamicOffset))
{
Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1);
// lookahead to use SQn instead of SCn for single
// character interruptions of runs in current window
if(!fUnicodeMode && iIn < in.length -1)
{
char ch2 = in[iIn+1];
if (ch2 >= dynamicOffset[iprevWindow] &&
ch2 < dynamicOffset[iprevWindow] + 0x80)
{
quoteSingleByte(ch, out);
selectWindow(iprevWindow);
continue;
}
}
out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
fUnicodeMode = false;
}
// try to locate a static window
else if (!fUnicodeMode && locateWindow(ch, staticOffset))
{
// static windows are not accessible from Unicode mode
Debug.out("located a static window", getCurrentWindow());
quoteSingleByte(ch, out);
selectWindow(iprevWindow); // restore current Window settings
continue;
}
// try to define a window around ch
else if (positionWindow(ch, out, fUnicodeMode) )
{
fUnicodeMode = false;
}
// If all else fails, start a Unicode run
else
{
iSCU = iOut;
out[iOut++] = SCU;
continue;
}
}
return iOut - iStartOut;
}
public byte[] compress(String inStr)
throws IllegalInputException, EndOfInputException
{
// Running out of room for output can cause non-optimal
// compression. In order to not slow down compression too
// much, not all intermediate state is constantly saved.
byte [] out = new byte[inStr.length() * 2];
char [] in = inStr.toCharArray();
//DEBUG
Debug.out("compress input: ",in);
reset();
while(true)
{
try
{
simpleCompress(in, charsRead(), out, bytesWritten());
// if we get here things went fine.
break;
}
catch (EndOfOutputException e)
{
// create a larger output buffer and continue
byte [] largerOut = new byte[out.length * 2];
System.arraycopy(out, 0, largerOut, 0, out.length);
out = largerOut;
}
}
byte [] trimmedOut = new byte[bytesWritten()];
System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length);
out = trimmedOut;
Debug.out("compress output: ", out);
return out;
}
/** reset is only needed to bail out after an exception and
restart with new input */
@Override
public void reset()
{
super.reset();
fUnicodeMode = false;
iSCU = - 1;
}
/** returns the number of bytes written **/
public int bytesWritten()
{
return iOut;
}
/** returns the number of bytes written **/
public int charsRead()
{
return iIn;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy