com.amazon.redshift.core.Encoding Maven / Gradle / Ivy
/*
* Copyright (c) 2003, PostgreSQL Global Development Group
* See the LICENSE file in the project root for more information.
*/
package com.amazon.redshift.core;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.HashMap;
import com.amazon.redshift.logger.LogLevel;
import com.amazon.redshift.logger.RedshiftLogger;
/**
* Representation of a particular character encoding.
*/
public class Encoding {
protected RedshiftLogger logger;
private static final Encoding DEFAULT_ENCODING = new Encoding();
/*
* Preferred JVM encodings for backend encodings.
*/
private static final HashMap encodings = new HashMap();
static {
//Note: this list should match the set of supported server
// encodings found in backend/util/mb/encnames.c
encodings.put("SQL_ASCII", new String[]{"ASCII", "US-ASCII"});
encodings.put("UNICODE", new String[]{"UTF-8", "UTF8"});
encodings.put("UTF8", new String[]{"UTF-8", "UTF8"});
encodings.put("LATIN1", new String[]{"ISO8859_1"});
encodings.put("LATIN2", new String[]{"ISO8859_2"});
encodings.put("LATIN3", new String[]{"ISO8859_3"});
encodings.put("LATIN4", new String[]{"ISO8859_4"});
encodings.put("ISO_8859_5", new String[]{"ISO8859_5"});
encodings.put("ISO_8859_6", new String[]{"ISO8859_6"});
encodings.put("ISO_8859_7", new String[]{"ISO8859_7"});
encodings.put("ISO_8859_8", new String[]{"ISO8859_8"});
encodings.put("LATIN5", new String[]{"ISO8859_9"});
encodings.put("LATIN7", new String[]{"ISO8859_13"});
encodings.put("LATIN9", new String[]{"ISO8859_15_FDIS"});
encodings.put("EUC_JP", new String[]{"EUC_JP"});
encodings.put("EUC_CN", new String[]{"EUC_CN"});
encodings.put("EUC_KR", new String[]{"EUC_KR"});
encodings.put("JOHAB", new String[]{"Johab"});
encodings.put("EUC_TW", new String[]{"EUC_TW"});
encodings.put("SJIS", new String[]{"MS932", "SJIS"});
encodings.put("BIG5", new String[]{"Big5", "MS950", "Cp950"});
encodings.put("GBK", new String[]{"GBK", "MS936"});
encodings.put("UHC", new String[]{"MS949", "Cp949", "Cp949C"});
encodings.put("TCVN", new String[]{"Cp1258"});
encodings.put("WIN1256", new String[]{"Cp1256"});
encodings.put("WIN1250", new String[]{"Cp1250"});
encodings.put("WIN874", new String[]{"MS874", "Cp874"});
encodings.put("WIN", new String[]{"Cp1251"});
encodings.put("ALT", new String[]{"Cp866"});
// We prefer KOI8-U, since it is a superset of KOI8-R.
encodings.put("KOI8", new String[]{"KOI8_U", "KOI8_R"});
// If the database isn't encoding-aware then we can't have
// any preferred encodings.
encodings.put("UNKNOWN", new String[0]);
// The following encodings do not have a java equivalent
encodings.put("MULE_INTERNAL", new String[0]);
encodings.put("LATIN6", new String[0]);
encodings.put("LATIN8", new String[0]);
encodings.put("LATIN10", new String[0]);
}
private interface UTFEncodingProvider {
Encoding getEncoding();
}
private static final UTFEncodingProvider UTF_ENCODING_PROVIDER;
static {
//for java 1.8 and older, use implementation optimized for char[]
final JavaVersion runtimeVersion = JavaVersion.getRuntimeVersion();
if (JavaVersion.v1_8.compareTo(runtimeVersion) >= 0) {
UTF_ENCODING_PROVIDER = new UTFEncodingProvider() {
@Override
public Encoding getEncoding() {
return new CharOptimizedUTF8Encoder();
}
};
} else {
//for newer versions, use default java behavior
UTF_ENCODING_PROVIDER = new UTFEncodingProvider() {
@Override
public Encoding getEncoding() {
return new ByteOptimizedUTF8Encoder();
}
};
}
}
private final Charset encoding;
private final boolean fastASCIINumbers;
/**
* Uses the default charset of the JVM.
*/
private Encoding() {
this(Charset.defaultCharset(), RedshiftLogger.getDriverLogger());
}
/**
* Subclasses may use this constructor if they know in advance of their ASCII number
* compatibility.
*
* @param encoding charset to use
* @param logger the logger to log the entry for debugging.
* @param fastASCIINumbers whether this encoding is compatible with ASCII numbers.
*/
protected Encoding(Charset encoding, boolean fastASCIINumbers, RedshiftLogger logger) {
this.logger = logger;
if (encoding == null) {
throw new NullPointerException("Null encoding charset not supported");
}
this.encoding = encoding;
this.fastASCIINumbers = fastASCIINumbers;
if (RedshiftLogger.isEnable()) {
this.logger.log(LogLevel.DEBUG, "Creating new Encoding {0} with fastASCIINumbers {1}",
new Object[]{encoding, fastASCIINumbers});
}
}
/**
* Use the charset passed as parameter and tests at creation time whether the specified encoding
* is compatible with ASCII numbers.
*
* @param encoding charset to use
* @param logger the logger to log the entry for debugging.
*/
protected Encoding(Charset encoding, RedshiftLogger logger) {
this(encoding, testAsciiNumbers(encoding), logger);
}
/**
* Returns true if this encoding has characters '-' and '0'..'9' in exactly same posision as
* ascii.
*
* @return true if the bytes can be scanned directly for ascii numbers.
*/
public boolean hasAsciiNumbers() {
return fastASCIINumbers;
}
/**
* Construct an Encoding for a given JVM encoding.
*
* @param jvmEncoding the name of the JVM encoding
* @param logger the logger to log the entry for debugging.
* @return an Encoding instance for the specified encoding, or an Encoding instance for the
* default JVM encoding if the specified encoding is unavailable.
*/
public static Encoding getJVMEncoding(String jvmEncoding, RedshiftLogger logger) {
if ("UTF-8".equals(jvmEncoding)) {
return UTF_ENCODING_PROVIDER.getEncoding();
}
if (Charset.isSupported(jvmEncoding)) {
return new Encoding(Charset.forName(jvmEncoding), logger);
}
return DEFAULT_ENCODING;
}
/**
* Construct an Encoding for a given database encoding.
*
* @param databaseEncoding the name of the database encoding
* @param logger the logger to log the entry for debugging.
* @return an Encoding instance for the specified encoding, or an Encoding instance for the
* default JVM encoding if the specified encoding is unavailable.
*/
public static Encoding getDatabaseEncoding(String databaseEncoding, RedshiftLogger logger) {
if ("UTF8".equals(databaseEncoding)) {
return UTF_ENCODING_PROVIDER.getEncoding();
}
// If the backend encoding is known and there is a suitable
// encoding in the JVM we use that. Otherwise we fall back
// to the default encoding of the JVM.
String[] candidates = encodings.get(databaseEncoding);
if (candidates != null) {
for (String candidate : candidates) {
if(RedshiftLogger.isEnable())
logger.log(LogLevel.DEBUG, "Search encoding candidate {0}", candidate);
if (Charset.isSupported(candidate)) {
return new Encoding(Charset.forName(candidate), logger);
}
}
}
// Try the encoding name directly -- maybe the charset has been
// provided by the user.
if (Charset.isSupported(databaseEncoding)) {
return new Encoding(Charset.forName(databaseEncoding), logger);
}
// Fall back to default JVM encoding.
if(RedshiftLogger.isEnable())
logger.log(LogLevel.DEBUG, "{0} encoding not found, returning default encoding", databaseEncoding);
return DEFAULT_ENCODING;
}
/**
* Get the name of the (JVM) encoding used.
*
* @return the JVM encoding name used by this instance.
*/
public String name() {
return encoding.name();
}
/**
* Encode a string to an array of bytes.
*
* @param s the string to encode
* @return a bytearray containing the encoded string
* @throws IOException if something goes wrong
*/
public byte[] encode(String s) throws IOException {
if (s == null) {
return null;
}
return s.getBytes(encoding);
}
/**
* Decode an array of bytes into a string.
*
* @param encodedString a byte array containing the string to decode
* @param offset the offset in encodedString
of the first byte of the encoded
* representation
* @param length the length, in bytes, of the encoded representation
* @return the decoded string
* @throws IOException if something goes wrong
*/
public String decode(byte[] encodedString, int offset, int length) throws IOException {
return new String(encodedString, offset, length, encoding);
}
/**
* Decode an array of bytes into a string.
*
* @param encodedString a byte array containing the string to decode
* @return the decoded string
* @throws IOException if something goes wrong
*/
public String decode(byte[] encodedString) throws IOException {
return decode(encodedString, 0, encodedString.length);
}
/**
* Get a Reader that decodes the given InputStream using this encoding.
*
* @param in the underlying stream to decode from
* @return a non-null Reader implementation.
* @throws IOException if something goes wrong
*/
public Reader getDecodingReader(InputStream in) throws IOException {
return new InputStreamReader(in, encoding);
}
/**
* Get a Writer that encodes to the given OutputStream using this encoding.
*
* @param out the underlying stream to encode to
* @return a non-null Writer implementation.
* @throws IOException if something goes wrong
*/
public Writer getEncodingWriter(OutputStream out) throws IOException {
return new OutputStreamWriter(out, encoding);
}
/**
* Get an Encoding using the default encoding for the JVM.
*
* @return an Encoding instance
*/
public static Encoding defaultEncoding() {
return DEFAULT_ENCODING;
}
@Override
public String toString() {
return encoding.name();
}
/**
* Checks whether this encoding is compatible with ASCII for the number characters '-' and
* '0'..'9'. Where compatible means that they are encoded with exactly same values.
*
* @return If faster ASCII number parsing can be used with this encoding.
*/
private static boolean testAsciiNumbers(Charset encoding) {
// TODO: test all postgres supported encoding to see if there are
// any which do _not_ have ascii numbers in same location
// at least all the encoding listed in the encodings hashmap have
// working ascii numbers
try {
String test = "-0123456789";
byte[] bytes = test.getBytes(encoding);
String res = new String(bytes, "US-ASCII");
return test.equals(res);
} catch (java.io.UnsupportedEncodingException e) {
return false;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy