com.cloudhopper.commons.charset.GSMCharset Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of ch-commons-charset Show documentation
Show all versions of ch-commons-charset Show documentation
Utility classes for converting between charsets (mostly "mobile" in nature) such as Unicode to GSM and vice versa
package com.cloudhopper.commons.charset;
/*
* #%L
* ch-commons-charset
* %%
* Copyright (C) 2012 Cloudhopper by Twitter
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.cloudhopper.commons.util.FastByteArrayOutputStream;
import java.io.IOException;
/**
* This class encodes and decodes Java Strings to and from the SMS default
* alphabet. It also supports the default extension table. The default alphabet
* and it's extension table is defined in GSM 03.38.
*
* @author joelauer (twitter: @jjlauer or http://twitter.com/jjlauer)
*/
public class GSMCharset extends BaseCharset {
public static final int EXTENDED_ESCAPE = 0x1b;
/** Page break (extended table). */
public static final int PAGE_BREAK = 0x0a;
public static final char[] CHAR_TABLE = {
'@', '\u00a3', '$', '\u00a5', '\u00e8', '\u00e9', '\u00f9', '\u00ec',
'\u00f2', '\u00c7', '\n', '\u00d8', '\u00f8', '\r', '\u00c5', '\u00e5',
'\u0394', '_', '\u03a6', '\u0393', '\u039b', '\u03a9', '\u03a0', '\u03a8',
'\u03a3', '\u0398', '\u039e', ' ', '\u00c6', '\u00e6', '\u00df', '\u00c9', // 0x1B is actually an escape which we'll encode to a space char
' ', '!', '"', '#', '\u00a4', '%', '&', '\'',
'(', ')', '*', '+', ',', '-', '.', '/',
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', ':', ';', '<', '=', '>', '?',
'\u00a1', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
'X', 'Y', 'Z', '\u00c4', '\u00d6', '\u00d1', '\u00dc', '\u00a7',
'\u00bf', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
'x', 'y', 'z', '\u00e4', '\u00f6', '\u00f1', '\u00fc', '\u00e0',
};
/**
* Extended character table. Characters in this table are accessed by the
* 'escape' character in the base table. It is important that none of the
* 'inactive' characters ever be matchable with a valid base-table
* character as this breaks the encoding loop.
*
* @see #EXTENDED_ESCAPE
*/
public static final char[] EXT_CHAR_TABLE = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, '^', 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
'{', '}', 0, 0, 0, 0, 0, '\\',
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, '[', '~', ']', 0,
'|', 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, '\u20ac', 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
};
/**
* Verifies that this charset can represent every character in the Java
* String (char sequence).
* @param str0 The String to verify
* @return True if the charset can represent every character in the Java
* String, otherwise false.
*/
static public boolean canRepresent(CharSequence str0) {
if (str0 == null) {
return true;
}
int len = str0.length();
for (int i = 0; i < len; i++) {
// get the char in this string
char c = str0.charAt(i);
// simple range checks for most common characters (0x20 -> 0x5F) or (0x61 -> 0x7E)
if ((c >= ' ' && c <= '_') || (c >= 'a' && c <= '~')) {
continue;
} else {
// 10X more efficient using a switch statement vs. a lookup table search
switch (c) {
case '\u00A3': // £
case '\u00A5': // ¥
case '\u00E8': // è
case '\u00E9': // é
case '\u00F9': // ù
case '\u00EC': // ì
case '\u00F2': // ò
case '\u00C7': // Ç
case '\n': // newline
case '\u00D8': // Ø
case '\u00F8': // ø
case '\r': // carriage return
case '\u00C5': // Å
case '\u00E5': // å
case '\u0394': // Δ
case '\u03A6': // Φ
case '\u0393': // Γ
case '\u039B': // Λ
case '\u03A9': // Ω
case '\u03A0': // Π
case '\u03A8': // Ψ
case '\u03A3': // Σ
case '\u0398': // Θ
case '\u039E': // Ξ
case '\u00C6': // Æ
case '\u00E6': // æ
case '\u00DF': // ß
case '\u00C9': // É
case '\u00A4': // ¤
case '\u00A1': // ¡
case '\u00C4': // Ä
case '\u00D6': // Ö
case '\u00D1': // Ñ
case '\u00DC': // Ü
case '\u00A7': // §
case '\u00BF': // ¿
case '\u00E4': // ä
case '\u00F6': // ö
case '\u00F1': // ñ
case '\u00FC': // ü
case '\u00E0': // à
case '\u20AC': // €
continue;
default:
return false;
}
}
}
return true;
}
@Override
public int estimateEncodeByteLength(CharSequence str0) {
if (str0 == null) {
return 0;
}
// only a couple chars are expected to be "double" bytes
return str0.length() + 10;
}
@Override
public byte[] encode(CharSequence str0) {
if (str0 == null) {
return null;
}
// estimate the length of the dynamic byte array
int estimatedByteLength = estimateEncodeByteLength(str0);
FastByteArrayOutputStream baos = new FastByteArrayOutputStream(estimatedByteLength);
try {
int len = str0.length();
for (int i = 0; i < len; i++) {
int search = 0;
char c = str0.charAt(i);
for (; search < CHAR_TABLE.length; search++) {
if (search == EXTENDED_ESCAPE) {
continue;
}
if (c == CHAR_TABLE[search]) {
baos.write(search);
break;
}
if (c == EXT_CHAR_TABLE[search]) {
baos.write(EXTENDED_ESCAPE);
baos.write(search);
break;
}
}
if (search == CHAR_TABLE.length) {
// A '?' character.
baos.write(0x3f);
}
}
} catch (IOException e) {
// should be an impossible error
throw new RuntimeException("Impossible error with FastByteArrayOutputStream: " + e.getMessage(), e);
}
return baos.toByteArray();
}
@Override
public int estimateDecodeCharLength(byte[] bytes) {
if (bytes == null) {
return 0;
}
if (bytes.length < 2) {
return bytes.length;
}
// only a couple chars are expected to be "double" bytes
return bytes.length + 10;
}
/**
* Decode an SMS default alphabet-encoded octet string into a Java String.
*/
@Override
public void decode(byte[] bytes, StringBuilder buffer) {
if (bytes == null) {
// append nothing
return;
}
char[] table = CHAR_TABLE;
for (int i = 0; i < bytes.length; i++) {
int code = (int)bytes[i] & 0x000000ff;
if (code == EXTENDED_ESCAPE) {
// take next char from extension table
table = EXT_CHAR_TABLE;
} else {
buffer.append((code >= table.length) ? '?' : table[code]);
// go back to the default table
table = CHAR_TABLE;
}
}
}
}