All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.freeutils.charset.gsm.PackedGSMCharset Maven / Gradle / Ivy

The newest version!
/*
 *  Copyright © 2005-2019 Amichai Rothman
 *
 *  This file is part of JCharset - the Java Charset package.
 *
 *  JCharset is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  JCharset is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with JCharset.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  For additional info see http://www.freeutils.net/source/jcharset/
 */

package net.freeutils.charset.gsm;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

/**
 * The PackedGSMCharset class handles the encoding and decoding of the
 * GSM default encoding charset, with packing as per GSM 03.38 / ETSI TS 123 038 spec.
 * 

* When there are 8*n-1 encoded bytes, there is ambiguity * since it's impossible to distinguish whether the final byte * contains a trailing '@' character (which is mapped to 0) * or 7 zero bits of padding following 7 data bytes. *

* When decoding, we opt for the latter interpretation * since it's far more likely, at the cost of losing a * trailing '@' character in strings whose unpacked size * is a multiple of 8, and whose last character is '@'. *

* An application that wishes to handle this rare case * properly must disambiguate this case externally, such * as by obtaining the original string length, and * appending the trailing '@' if the length * shows that there is one character missing. *

* Alternatively, the spec supports replacing the zero * padding in such a case with a CR character, which is * then removed by the receiver, but is harmless also on * devices that display it as-is since a CR is invisible. * This implementation has configurable support for CR padding. *

* However, this CR padding introduces a new ambiguity, with * a string that really does end with a CR character on an * 8-byte boundary, so in this case an extra CR is appended * to it, and due to the semantics of CR in the spec, a double * CR is equivalent to a single CR, so this is harmless as well. *

* The encoding and decoding are based on the mapping at * http://www.unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT * * @author Amichai Rothman * @since 2007-03-20 */ public class PackedGSMCharset extends GSMCharset { static final int BUFFER_SIZE = 256; static final byte CR = 0x0D; /** * Specifies whether to use CR padding instead of zero padding * when encoding/decoding in order to disambiguate the 7 padding * zero bits in strings whose length is 8*n-1 bytes from a * trailing '@' character in strings of length 8*n. */ final boolean padWithCR; /** * Initializes a new charset with the given canonical name and alias * set, and byte-to-char/char-to-byte lookup tables. * * @param canonicalName the canonical name of this charset * @param aliases an array of this charset's aliases, or null if it has no aliases * @param byteToChar a byte-to-char conversion table for this charset * @param byteToCharEscaped a byte-to-char conversion table for this charset * for the escaped characters * @param charToByte a char-to-byte conversion table for this charset. It can * be generated on-the-fly by calling createInverseLookupTable(byteToChar). * @param charToByteEscaped a char-to-byte conversion table for this charset * for the escaped characters * @param padWithCR specifies whether to apply {@link PackedGSMCharset CR padding} * or the original (but ambiguous) zero padding * @throws java.nio.charset.IllegalCharsetNameException * if the canonical name or any of the aliases are illegal */ protected PackedGSMCharset(String canonicalName, String[] aliases, int[] byteToChar, int[] byteToCharEscaped, int[][] charToByte, int[][] charToByteEscaped, boolean padWithCR) { super(canonicalName, aliases, byteToChar, byteToCharEscaped, charToByte, charToByteEscaped); this.padWithCR = padWithCR; } /** * Constructs a new decoder for this charset. 
* * @return a new decoder for this charset */ @Override public CharsetDecoder newDecoder() { return new Decoder(this); } /** * Constructs a new encoder for this charset. * * @return a new encoder for this charset */ @Override public CharsetEncoder newEncoder() { return new Encoder(this); } /** * The Encoder inner class handles the encoding of the * Packed GSM default encoding charset. */ protected class Encoder extends GSMCharset.Encoder { int bitpos; byte current; ByteBuffer buf; /** * Constructs an Encoder. * * @param charset the charset that created this encoder */ protected Encoder(Charset charset) { super(charset, 7 / 8f, 2f); buf = ByteBuffer.allocate(BUFFER_SIZE); implReset(); } /** * Resets this encoder, clearing any charset-specific internal state. */ @Override protected void implReset() { bitpos = 0; current = 0; buf.limit(0); } /** * Flushes this encoder. * * @param out the output byte buffer * * @return a coder-result object, either {@link CoderResult#UNDERFLOW} or * {@link CoderResult#OVERFLOW} */ @Override protected CoderResult implFlush(ByteBuffer out) { // flush buffer CoderResult result = pack(buf, out); // handle CR padding if necessary if (padWithCR && bitpos <= 1) { // bitpos is 0 or 1 if (bitpos == 1) { // if the output is 8*n-1 bytes long, the last byte has 7 padding zero // bits which may be ambiguously interpreted as an '@' character, // so in this case we replace the padding with a harmless CR current |= (CR << 1); } else if (out.position() > 0 && out.get(out.position() - 1) >>> 1 == CR) { // if the output is 8*n bytes long and really does end with a CR, // we need to disambiguate this from the CR padding, // so we add an extra CR (due to the spec's definition of CR, // this is equivalent to a single CR and thus also harmless) current = CR; bitpos = 7; } } // flush last (current) partial byte if it exists if (bitpos != 0) { if (!out.hasRemaining()) return CoderResult.OVERFLOW; out.put(current); // write final leftover byte } return result; 
} /** * Encodes one or more characters into one or more bytes. * * @param in the input character buffer * @param out the output byte buffer * @return a coder-result object describing the reason for termination */ @Override protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) { CoderResult result; while (true) { // output buffered data if (buf.hasRemaining()) { result = pack(buf, out); if (result == CoderResult.OVERFLOW) return result; } // process new data into buffer buf.clear(); result = super.encodeLoop(in, buf); buf.flip(); // stop if out of input or error if (!buf.hasRemaining() || result.isError()) return result; } } /** * Packs the given data into full bytes. * * @param in the input byte buffer * @param out the output byte buffer * @return a coder-result object, either {@link CoderResult#UNDERFLOW} or * {@link CoderResult#OVERFLOW} */ protected CoderResult pack(ByteBuffer in, ByteBuffer out) { int remaining = in.remaining(); while (remaining-- > 0) { if (!out.hasRemaining()) return CoderResult.OVERFLOW; byte b = (byte)(in.get() & 0x7F); // remove top bit // assign first group of partial bits current |= b << bitpos; // assign second group of partial bits (if exist) if (bitpos > 0) { // if packed byte is full out.put(current); current = (byte)(b >> (8 - bitpos)); // keep left-over bits (if any) } bitpos = (bitpos + 7) % 8; } return CoderResult.UNDERFLOW; } } /** * The Decoder inner class handles the decoding of the * Packed GSM default encoding charset. */ protected class Decoder extends GSMCharset.Decoder { int bitpos; byte current; byte prev; int unpackedCount; ByteBuffer buf; /** * Constructs a Decoder. * * @param charset the charset that created this decoder */ protected Decoder(Charset charset) { super(charset, 8 / 7f, 2f); buf = ByteBuffer.allocate(BUFFER_SIZE); implReset(); } /** * Resets this decoder, clearing any charset-specific internal state. 
*/ @Override protected void implReset() { bitpos = 0; current = 0; prev = 0; unpackedCount = 0; buf.limit(0); } /** * Flushes this decoder. * * @param out the output character buffer * * @return a coder-result object, either {@link CoderResult#UNDERFLOW} or * {@link CoderResult#OVERFLOW} */ @Override protected CoderResult implFlush(CharBuffer out) { // fix output edge cases caused by ambiguous padding, // depending on the CR padding configuration: // either remove a trailing '@' character if the string length is 8*n, // or remove a trailing CR character if the string length is 8*n // or if the string length is 8*n+1 and it ends with two CR characters int mod = unpackedCount % 8; if (mod <= 1) { // mod is 0 or 1 int pos = out.position() - 1; if (pos > 0) { char c = out.get(pos); if (c == '@' && !padWithCR && mod == 0 || c == CR && padWithCR && (mod == 0 || out.get(pos - 1) == CR)) out.position(pos); // remove last character } } return CoderResult.UNDERFLOW; } /** * Decodes one or more bytes into one or more characters. 
* * @param in the input byte buffer * @param out the output character buffer * @return a coder-result object describing the reason for termination */ @Override protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) { while (true) { // unpack input data into buffer unpackedCount -= buf.remaining(); // it will be counted again after unpack buf.compact(); // move data to beginning and prepare to write more CoderResult unpackResult = unpack(in, buf); buf.flip(); // prepare to read if (!buf.hasRemaining()) return unpackResult; // underflow unpackedCount += buf.remaining(); // decode buffered unpacked data to output CoderResult decodeResult = super.decodeLoop(buf, out); // handle out of output space and buffer still has data in it if (buf.hasRemaining() || decodeResult.isError()) { if (decodeResult.isUnderflow()) { // last byte is escape byte // if there's more input or at least another unpacked byte // (the 8th doesn't require reading from input), just continue if (in.hasRemaining() || unpackResult.isOverflow()) continue; // otherwise we really need more input, so undo the last byte // (escape sequence which was cut in middle) so caller can // properly handle malformed input if there is no more input in.position(in.position() - 1); // unread the byte bitpos = (bitpos + 9) % 8; // undo its unpacking too current = prev; buf.limit(buf.position()); unpackedCount--; } return decodeResult; } } } /** * Unpacks the given data into original bytes. 
* * @param in the input byte buffer * @param out the output byte buffer * @return a coder-result object, either {@link CoderResult#UNDERFLOW} or * {@link CoderResult#OVERFLOW} */ protected CoderResult unpack(ByteBuffer in, ByteBuffer out) { int remaining = out.remaining(); while (remaining-- > 0) { if (!in.hasRemaining() && bitpos != 1) return CoderResult.UNDERFLOW; if (bitpos == 0) { prev = current; current = in.get(); } // remove top bit and assign first group of partial bits byte b = (byte)(((current & 0xFF) >> bitpos) & 0x7F); // remove top bit and assign second group of partial bits (if exist) if (bitpos >= 2) { prev = current; current = in.get(); b |= (byte)((current << (8 - bitpos)) & 0x7F); } bitpos = (bitpos + 7) % 8; out.put(b); } return CoderResult.OVERFLOW; } } /** * Unpacks the given data into original bytes. *

* This is an external utility method and is not used * internally by the Charset implementation. * * @param in the input bytes * @return the unpacked output bytes */ public static byte[] unpack(byte[] in) { byte[] out = new byte[(in.length * 8) / 7]; int len = out.length; int current = 0; int bitpos = 0; for (int i = 0; i < len; i++) { // remove top bit and assign first group of partial bits out[i] = (byte)(((in[current] & 0xFF) >> bitpos) & 0x7F); // remove top bit and assign second group of partial bits (if exist) if (bitpos > 1) out[i] |= (byte)((in[++current] << (8 - bitpos)) & 0x7F); else if (bitpos == 1) current++; bitpos = (bitpos + 7) % 8; } // this fixes an ambiguity bug in the specs // where the last of 8 packed bytes is 0 // and it's impossible to distinguish whether it is a // trailing '@' character (which is mapped to 0) // or extra zero-bit padding for 7 actual data bytes. // // we opt for the latter, since it's far more likely, // at the cost of losing a trailing '@' character // in strings whose unpacked size modulo 8 is 0, // and whose last character is '@'. // // an application that wishes to handle this rare case // properly must disambiguate this case externally, such // as by obtaining the original string length, and // appending the trailing '@' if the length // shows that there is one character missing. if (len % 8 == 0 && len > 0 && out[len - 1] == 0) { byte[] fixed = new byte[len - 1]; System.arraycopy(out, 0, fixed, 0, len - 1); out = fixed; } return out; } /** * Packs the given data into full bytes. *

* This is an external utility method and is not used * internally by the Charset implementation. * * @param in the input bytes * @return the packed output bytes */ public static byte[] pack(byte[] in) { byte[] out = new byte[(int)Math.ceil((in.length * 7) / 8f)]; int current = 0; int bitpos = 0; for (byte b : in) { b &= 0x7F; // remove top bit // assign first group of partial bits out[current] |= b << bitpos; // assign second group of partial bits (if exist) if (bitpos > 1) out[++current] |= b >> 8 - bitpos; else if (bitpos == 1) // packed byte is full (but no left-over bits) current++; bitpos = (bitpos + 7) % 8; } return out; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy