All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ibm.icu.text.UnicodeDecompressor Maven / Gradle / Ivy

Go to download

International Component for Unicode for Java (ICU4J) is a mature, widely used Java library providing Unicode and Globalization support

There is a newer version: 76.1
Show newest version
/*
 *******************************************************************************
 * Copyright (C) 1996-2009, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

package com.ibm.icu.text;

/**
* A decompression engine implementing the Standard Compression Scheme
* for Unicode (SCSU) as outlined in Unicode Technical
* Report #6.
*
* 

USAGE

* *

The static methods on UnicodeDecompressor may be used in a * straightforward manner to decompress simple strings:

* *
*  byte [] compressed = ... ; // get compressed bytes from somewhere
*  String result = UnicodeDecompressor.decompress(compressed);
* 
* *

The static methods have a fairly large memory footprint. * For finer-grained control over memory usage, * UnicodeDecompressor offers more powerful APIs allowing * iterative decompression:

* *
*  // Decompress an array "bytes" of length "len" using a buffer of 512 chars
*  // to the Writer "out"
*
*  UnicodeDecompressor myDecompressor         = new UnicodeDecompressor();
*  final static int    BUFSIZE                = 512;
*  char []             charBuffer             = new char [ BUFSIZE ];
*  int                 charsWritten           = 0;
*  int []              bytesRead              = new int [1];
*  int                 totalBytesDecompressed = 0;
*  int                 totalCharsWritten      = 0;
*
*  do {
*    // do the decompression
*    charsWritten = myDecompressor.decompress(bytes, totalBytesDecompressed, 
*                                             len, bytesRead,
*                                             charBuffer, 0, BUFSIZE);
*
*    // do something with the current set of chars
*    out.write(charBuffer, 0, charsWritten);
*
*    // update the no. of bytes decompressed
*    totalBytesDecompressed += bytesRead[0];
*
*    // update the no. of chars written
*    totalCharsWritten += charsWritten;
*
*  } while(totalBytesDecompressed < len);
*
*  myDecompressor.reset(); // reuse decompressor
* 
* *

Decompression is performed according to the standard set forth in * Unicode Technical * Report #6

* * @see UnicodeCompressor * * @author Stephen F. Booth * @stable ICU 2.4 */ public final class UnicodeDecompressor implements SCSU { //========================== // Instance variables //========================== /** Alias to current dynamic window */ private int fCurrentWindow = 0; /** Dynamic compression window offsets */ private int [] fOffsets = new int [ NUMWINDOWS ]; /** Current compression mode */ private int fMode = SINGLEBYTEMODE; /** Size of our internal buffer */ private final static int BUFSIZE = 3; /** Internal buffer for saving state */ private byte [] fBuffer = new byte [BUFSIZE]; /** Number of characters in our internal buffer */ private int fBufferLength = 0; /** * Create a UnicodeDecompressor. * Sets all windows to their default values. * @see #reset * @stable ICU 2.4 */ public UnicodeDecompressor(){ reset(); // initialize to defaults } /** * Decompress a byte array into a String. * @param buffer The byte array to decompress. * @return A String containing the decompressed characters. * @see #decompress(byte [], int, int) * @stable ICU 2.4 */ public static String decompress(byte [] buffer){ char [] buf = decompress(buffer, 0, buffer.length); return new String(buf); } /** * Decompress a byte array into a Unicode character array. * @param buffer The byte array to decompress. * @param start The start of the byte run to decompress. * @param limit The limit of the byte run to decompress. * @return A character array containing the decompressed bytes. * @see #decompress(byte []) * @stable ICU 2.4 */ public static char [] decompress(byte [] buffer, int start, int limit) { UnicodeDecompressor comp = new UnicodeDecompressor(); // use a buffer we know will never overflow // in the worst case, each byte will decompress // to a surrogate pair (buffer must be at least 2 chars) int len = Math.max(2, 2 * (limit - start)); char [] temp = new char [len]; int charCount = comp.decompress(buffer, start, limit, null, temp, 0, len); char [] result = new char [charCount]; System.arraycopy(temp, 0, result, 0, charCount); return result; } /** * Decompress a byte array into a Unicode character array. * * This function will either completely fill the output buffer, * or consume the entire input. * * @param byteBuffer The byte buffer to decompress. * @param byteBufferStart The start of the byte run to decompress. * @param byteBufferLimit The limit of the byte run to decompress. * @param bytesRead A one-element array. If not null, on return * the number of bytes read from byteBuffer. * @param charBuffer A buffer to receive the decompressed data. * This buffer must be at minimum two characters in size. * @param charBufferStart The starting offset to which to write * decompressed data. * @param charBufferLimit The limiting offset for writing * decompressed data. * @return The number of Unicode characters written to charBuffer. * @stable ICU 2.4 */ public int decompress(byte [] byteBuffer, int byteBufferStart, int byteBufferLimit, int [] bytesRead, char [] charBuffer, int charBufferStart, int charBufferLimit) { // the current position in the source byte buffer int bytePos = byteBufferStart; // the current position in the target char buffer int ucPos = charBufferStart; // the current byte from the source buffer int aByte = 0x00; // charBuffer must be at least 2 chars in size if(charBuffer.length < 2 || (charBufferLimit - charBufferStart) < 2) throw new IllegalArgumentException("charBuffer.length < 2"); // if our internal buffer isn't empty, flush its contents // to the output buffer before doing any more decompression if(fBufferLength > 0) { int newBytes = 0; // fill the buffer completely, to guarantee one full character if(fBufferLength != BUFSIZE) { newBytes = fBuffer.length - fBufferLength; // verify there are newBytes bytes in byteBuffer if(byteBufferLimit - byteBufferStart < newBytes) newBytes = byteBufferLimit - byteBufferStart; System.arraycopy(byteBuffer, byteBufferStart, fBuffer, fBufferLength, newBytes); } // reset buffer length to 0 before recursive call fBufferLength = 0; // call self recursively to decompress the buffer int count = decompress(fBuffer, 0, fBuffer.length, null, charBuffer, charBufferStart, charBufferLimit); // update the positions into the arrays ucPos += count; bytePos += newBytes; } // the main decompression loop mainLoop: while(bytePos < byteBufferLimit && ucPos < charBufferLimit) { switch(fMode) { case SINGLEBYTEMODE: // single-byte mode decompression loop singleByteModeLoop: while(bytePos < byteBufferLimit && ucPos < charBufferLimit) { aByte = byteBuffer[bytePos++] & 0xFF; switch(aByte) { // All bytes from 0x80 through 0xFF are remapped // to chars or surrogate pairs according to the // currently active window case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87: case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97: case 0x98: case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D: case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2: case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7: case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC: case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0: case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5: case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA: case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF: case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4: case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9: case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE: case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8: case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED: case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2: case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7: case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC: case 0xFD: case 0xFE: case 0xFF: // For offsets <= 0xFFFF, convert to a single char // by adding the window's offset and subtracting // the generic compression offset if(fOffsets[ fCurrentWindow ] <= 0xFFFF) { charBuffer[ucPos++] = (char) (aByte + fOffsets[ fCurrentWindow ] - COMPRESSIONOFFSET); } // For offsets > 0x10000, convert to a surrogate pair by // normBase = window's offset - 0x10000 // high surr. = 0xD800 + (normBase >> 10) // low surr. = 0xDC00 + (normBase & 0x3FF) + (byte & 0x7F) else { // make sure there is enough room to write // both characters // if not, save state and break out if((ucPos + 1) >= charBufferLimit) { --bytePos; System.arraycopy(byteBuffer, bytePos, fBuffer, 0, byteBufferLimit - bytePos); fBufferLength = byteBufferLimit - bytePos; bytePos += fBufferLength; break mainLoop; } int normalizedBase = fOffsets[ fCurrentWindow ] - 0x10000; charBuffer[ucPos++] = (char) (0xD800 + (normalizedBase >> 10)); charBuffer[ucPos++] = (char) (0xDC00 + (normalizedBase & 0x3FF)+(aByte & 0x7F)); } break; // bytes from 0x20 through 0x7F are treated as ASCII and // are remapped to chars by padding the high byte // (this is the same as quoting from static window 0) // NUL (0x00), HT (0x09), CR (0x0A), LF (0x0D) // are treated as ASCII as well case 0x00: case 0x09: case 0x0A: case 0x0D: case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: case 0x25: case 0x26: case 0x27: case 0x28: case 0x29: case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E: case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37: case 0x38: case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47: case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B: case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60: case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A: case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F: case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77: case 0x78: case 0x79: case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E: case 0x7F: charBuffer[ucPos++] = (char) aByte; break; // quote unicode case SQUOTEU: // verify we have two bytes following tag // if not, save state and break out if( (bytePos + 1) >= byteBufferLimit ) { --bytePos; System.arraycopy(byteBuffer, bytePos, fBuffer, 0, byteBufferLimit - bytePos); fBufferLength = byteBufferLimit - bytePos; bytePos += fBufferLength; break mainLoop; } aByte = byteBuffer[bytePos++]; charBuffer[ucPos++] = (char) (aByte << 8 | (byteBuffer[bytePos++] & 0xFF)); break; // switch to Unicode mode case SCHANGEU: fMode = UNICODEMODE; break singleByteModeLoop; //break; // handle all quote tags case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3: case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7: // verify there is a byte following the tag // if not, save state and break out if(bytePos >= byteBufferLimit) { --bytePos; System.arraycopy(byteBuffer, bytePos, fBuffer, 0, byteBufferLimit - bytePos); fBufferLength = byteBufferLimit - bytePos; bytePos += fBufferLength; break mainLoop; } // if the byte is in the range 0x00 - 0x7F, use // static window n otherwise, use dynamic window n int dByte = byteBuffer[bytePos++] & 0xFF; charBuffer[ucPos++] = (char) (dByte+ (dByte >= 0x00 && dByte < 0x80 ? sOffsets[aByte - SQUOTE0] : (fOffsets[aByte - SQUOTE0] - COMPRESSIONOFFSET))); break; // handle all change tags case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3: case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7: fCurrentWindow = aByte - SCHANGE0; break; // handle all define tags case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3: case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7: // verify there is a byte following the tag // if not, save state and break out if(bytePos >= byteBufferLimit) { --bytePos; System.arraycopy(byteBuffer, bytePos, fBuffer, 0, byteBufferLimit - bytePos); fBufferLength = byteBufferLimit - bytePos; bytePos += fBufferLength; break mainLoop; } fCurrentWindow = aByte - SDEFINE0; fOffsets[fCurrentWindow] = sOffsetTable[byteBuffer[bytePos++] & 0xFF]; break; // handle define extended tag case SDEFINEX: // verify we have two bytes following tag // if not, save state and break out if((bytePos + 1) >= byteBufferLimit ) { --bytePos; System.arraycopy(byteBuffer, bytePos, fBuffer, 0, byteBufferLimit - bytePos); fBufferLength = byteBufferLimit - bytePos; bytePos += fBufferLength; break mainLoop; } aByte = byteBuffer[bytePos++] & 0xFF; fCurrentWindow = (aByte & 0xE0) >> 5; fOffsets[fCurrentWindow] = 0x10000 + (0x80 * (((aByte & 0x1F) << 8) | (byteBuffer[bytePos++] & 0xFF))); break; // reserved, shouldn't happen case SRESERVED: break; } // end switch } // end while break; case UNICODEMODE: // unicode mode decompression loop unicodeModeLoop: while(bytePos < byteBufferLimit && ucPos < charBufferLimit) { aByte = byteBuffer[bytePos++] & 0xFF; switch(aByte) { // handle all define tags case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3: case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7: // verify there is a byte following tag // if not, save state and break out if(bytePos >= byteBufferLimit ) { --bytePos; System.arraycopy(byteBuffer, bytePos, fBuffer, 0, byteBufferLimit - bytePos); fBufferLength = byteBufferLimit - bytePos; bytePos += fBufferLength; break mainLoop; } fCurrentWindow = aByte - UDEFINE0; fOffsets[fCurrentWindow] = sOffsetTable[byteBuffer[bytePos++] & 0xFF]; fMode = SINGLEBYTEMODE; break unicodeModeLoop; //break; // handle define extended tag case UDEFINEX: // verify we have two bytes following tag // if not, save state and break out if((bytePos + 1) >= byteBufferLimit ) { --bytePos; System.arraycopy(byteBuffer, bytePos, fBuffer, 0, byteBufferLimit - bytePos); fBufferLength = byteBufferLimit - bytePos; bytePos += fBufferLength; break mainLoop; } aByte = byteBuffer[bytePos++] & 0xFF; fCurrentWindow = (aByte & 0xE0) >> 5; fOffsets[fCurrentWindow] = 0x10000 + (0x80 * (((aByte & 0x1F) << 8) | (byteBuffer[bytePos++] & 0xFF))); fMode = SINGLEBYTEMODE; break unicodeModeLoop; //break; // handle all change tags case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3: case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7: fCurrentWindow = aByte - UCHANGE0; fMode = SINGLEBYTEMODE; break unicodeModeLoop; //break; // quote unicode case UQUOTEU: // verify we have two bytes following tag // if not, save state and break out if(bytePos >= byteBufferLimit - 1) { --bytePos; System.arraycopy(byteBuffer, bytePos, fBuffer, 0, byteBufferLimit - bytePos); fBufferLength = byteBufferLimit - bytePos; bytePos += fBufferLength; break mainLoop; } aByte = byteBuffer[bytePos++]; charBuffer[ucPos++] = (char) (aByte << 8 | (byteBuffer[bytePos++] & 0xFF)); break; default: // verify there is a byte following tag // if not, save state and break out if(bytePos >= byteBufferLimit ) { --bytePos; System.arraycopy(byteBuffer, bytePos, fBuffer, 0, byteBufferLimit - bytePos); fBufferLength = byteBufferLimit - bytePos; bytePos += fBufferLength; break mainLoop; } charBuffer[ucPos++] = (char) (aByte << 8 | (byteBuffer[bytePos++] & 0xFF)); break; } // end switch } // end while break; } // end switch( fMode ) } // end while // fill in output parameter if(bytesRead != null) bytesRead [0] = (bytePos - byteBufferStart); // return # of chars written return (ucPos - charBufferStart); } /** * Reset the decompressor to its initial state. * @stable ICU 2.4 */ public void reset() { // reset dynamic windows fOffsets[0] = 0x0080; // Latin-1 fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A fOffsets[2] = 0x0400; // Cyrillic fOffsets[3] = 0x0600; // Arabic fOffsets[4] = 0x0900; // Devanagari fOffsets[5] = 0x3040; // Hiragana fOffsets[6] = 0x30A0; // Katakana fOffsets[7] = 0xFF00; // Fullwidth ASCII fCurrentWindow = 0; // Make current window Latin-1 fMode = SINGLEBYTEMODE; // Always start in single-byte mode fBufferLength = 0; // Empty buffer } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy