com.amazon.redshift.core.Encoding Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of redshift-jdbc42 Show documentation
Java JDBC 4.2 (JRE 8+) driver for Redshift database
There is a newer version: 2.1.0.30
/*
 * Copyright (c) 2003, PostgreSQL Global Development Group
 * See the LICENSE file in the project root for more information.
 */

package com.amazon.redshift.core;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.HashMap;

import com.amazon.redshift.logger.LogLevel;
import com.amazon.redshift.logger.RedshiftLogger;

/**
 * Representation of a particular character encoding.
 */
public class Encoding {

  protected RedshiftLogger logger;

  private static final Encoding DEFAULT_ENCODING = new Encoding();

  /*
   * Preferred JVM encodings for backend encodings.
   */
  private static final HashMap encodings = new HashMap();

  static {
    //Note: this list should match the set of supported server
    // encodings found in backend/util/mb/encnames.c
    encodings.put("SQL_ASCII", new String[]{"ASCII", "US-ASCII"});
    encodings.put("UNICODE", new String[]{"UTF-8", "UTF8"});
    encodings.put("UTF8", new String[]{"UTF-8", "UTF8"});
    encodings.put("LATIN1", new String[]{"ISO8859_1"});
    encodings.put("LATIN2", new String[]{"ISO8859_2"});
    encodings.put("LATIN3", new String[]{"ISO8859_3"});
    encodings.put("LATIN4", new String[]{"ISO8859_4"});
    encodings.put("ISO_8859_5", new String[]{"ISO8859_5"});
    encodings.put("ISO_8859_6", new String[]{"ISO8859_6"});
    encodings.put("ISO_8859_7", new String[]{"ISO8859_7"});
    encodings.put("ISO_8859_8", new String[]{"ISO8859_8"});
    encodings.put("LATIN5", new String[]{"ISO8859_9"});
    encodings.put("LATIN7", new String[]{"ISO8859_13"});
    encodings.put("LATIN9", new String[]{"ISO8859_15_FDIS"});
    encodings.put("EUC_JP", new String[]{"EUC_JP"});
    encodings.put("EUC_CN", new String[]{"EUC_CN"});
    encodings.put("EUC_KR", new String[]{"EUC_KR"});
    encodings.put("JOHAB", new String[]{"Johab"});
    encodings.put("EUC_TW", new String[]{"EUC_TW"});
    encodings.put("SJIS", new String[]{"MS932", "SJIS"});
    encodings.put("BIG5", new String[]{"Big5", "MS950", "Cp950"});
    encodings.put("GBK", new String[]{"GBK", "MS936"});
    encodings.put("UHC", new String[]{"MS949", "Cp949", "Cp949C"});
    encodings.put("TCVN", new String[]{"Cp1258"});
    encodings.put("WIN1256", new String[]{"Cp1256"});
    encodings.put("WIN1250", new String[]{"Cp1250"});
    encodings.put("WIN874", new String[]{"MS874", "Cp874"});
    encodings.put("WIN", new String[]{"Cp1251"});
    encodings.put("ALT", new String[]{"Cp866"});
    // We prefer KOI8-U, since it is a superset of KOI8-R.
    encodings.put("KOI8", new String[]{"KOI8_U", "KOI8_R"});
    // If the database isn't encoding-aware then we can't have
    // any preferred encodings.
    encodings.put("UNKNOWN", new String[0]);
    // The following encodings do not have a java equivalent
    encodings.put("MULE_INTERNAL", new String[0]);
    encodings.put("LATIN6", new String[0]);
    encodings.put("LATIN8", new String[0]);
    encodings.put("LATIN10", new String[0]);
  }

  private interface UTFEncodingProvider {
    Encoding getEncoding();
  }

  private static final UTFEncodingProvider UTF_ENCODING_PROVIDER;

  static {
    //for java 1.8 and older, use implementation optimized for char[]
    final JavaVersion runtimeVersion = JavaVersion.getRuntimeVersion();
    if (JavaVersion.v1_8.compareTo(runtimeVersion) >= 0) {
      UTF_ENCODING_PROVIDER = new UTFEncodingProvider() {
        @Override
        public Encoding getEncoding() {
          return new CharOptimizedUTF8Encoder();
        }
      };
    } else {
      //for newer versions, use default java behavior
      UTF_ENCODING_PROVIDER = new UTFEncodingProvider() {
        @Override
        public Encoding getEncoding() {
          return new ByteOptimizedUTF8Encoder();
        }
      };
    }
  }

  private final Charset encoding;
  private final boolean fastASCIINumbers;

  /**
   * Uses the default charset of the JVM.
   */
  private Encoding() {
    this(Charset.defaultCharset(), RedshiftLogger.getDriverLogger());
  }

  /**
   * Subclasses may use this constructor if they know in advance of their ASCII number
   * compatibility.
   *
   * @param encoding charset to use
   * @param logger the logger to log the entry for debugging.       
   * @param fastASCIINumbers whether this encoding is compatible with ASCII numbers.
   */
  protected Encoding(Charset encoding, boolean fastASCIINumbers, RedshiftLogger logger) {
  	this.logger = logger;
    if (encoding == null) {
      throw new NullPointerException("Null encoding charset not supported");
    }
    this.encoding = encoding;
    this.fastASCIINumbers = fastASCIINumbers;
    if (RedshiftLogger.isEnable()) {
      this.logger.log(LogLevel.DEBUG, "Creating new Encoding {0} with fastASCIINumbers {1}",
          new Object[]{encoding, fastASCIINumbers});
    }
  }

  /**
   * Use the charset passed as parameter and tests at creation time whether the specified encoding
   * is compatible with ASCII numbers.
   *
   * @param encoding charset to use
   * @param logger the logger to log the entry for debugging.       
   */
  protected Encoding(Charset encoding, RedshiftLogger logger) {
    this(encoding, testAsciiNumbers(encoding), logger);
  }

  /**
   * Returns true if this encoding has characters '-' and '0'..'9' in exactly same posision as
   * ascii.
   *
   * @return true if the bytes can be scanned directly for ascii numbers.
   */
  public boolean hasAsciiNumbers() {
    return fastASCIINumbers;
  }

  /**
   * Construct an Encoding for a given JVM encoding.
   *
   * @param jvmEncoding the name of the JVM encoding
   * @param logger the logger to log the entry for debugging.       
   * @return an Encoding instance for the specified encoding, or an Encoding instance for the
   *     default JVM encoding if the specified encoding is unavailable.
   */
  public static Encoding getJVMEncoding(String jvmEncoding, RedshiftLogger logger) {
    if ("UTF-8".equals(jvmEncoding)) {
      return UTF_ENCODING_PROVIDER.getEncoding();
    }
    if (Charset.isSupported(jvmEncoding)) {
      return new Encoding(Charset.forName(jvmEncoding), logger);
    }
    return DEFAULT_ENCODING;
  }

  /**
   * Construct an Encoding for a given database encoding.
   *
   * @param databaseEncoding the name of the database encoding
   * @param logger the logger to log the entry for debugging.       
   * @return an Encoding instance for the specified encoding, or an Encoding instance for the
   *     default JVM encoding if the specified encoding is unavailable.
   */
  public static Encoding getDatabaseEncoding(String databaseEncoding, RedshiftLogger logger) {
    if ("UTF8".equals(databaseEncoding)) {
      return UTF_ENCODING_PROVIDER.getEncoding();
    }
    // If the backend encoding is known and there is a suitable
    // encoding in the JVM we use that. Otherwise we fall back
    // to the default encoding of the JVM.
    String[] candidates = encodings.get(databaseEncoding);
    if (candidates != null) {
      for (String candidate : candidates) {
      	if(RedshiftLogger.isEnable())
      		logger.log(LogLevel.DEBUG, "Search encoding candidate {0}", candidate);
        if (Charset.isSupported(candidate)) {
          return new Encoding(Charset.forName(candidate), logger);
        }
      }
    }

    // Try the encoding name directly -- maybe the charset has been
    // provided by the user.
    if (Charset.isSupported(databaseEncoding)) {
      return new Encoding(Charset.forName(databaseEncoding), logger);
    }

    // Fall back to default JVM encoding.
    if(RedshiftLogger.isEnable())
    	logger.log(LogLevel.DEBUG, "{0} encoding not found, returning default encoding", databaseEncoding);
    return DEFAULT_ENCODING;
  }

  /**
   * Get the name of the (JVM) encoding used.
   *
   * @return the JVM encoding name used by this instance.
   */
  public String name() {
    return encoding.name();
  }

  /**
   * Encode a string to an array of bytes.
   *
   * @param s the string to encode
   * @return a bytearray containing the encoded string
   * @throws IOException if something goes wrong
   */
  public byte[] encode(String s) throws IOException {
    if (s == null) {
      return null;
    }

    return s.getBytes(encoding);
  }

  /**
   * Decode an array of bytes into a string.
   *
   * @param encodedString a byte array containing the string to decode
   * @param offset        the offset in encodedString of the first byte of the encoded
   *                      representation
   * @param length        the length, in bytes, of the encoded representation
   * @return the decoded string
   * @throws IOException if something goes wrong
   */
  public String decode(byte[] encodedString, int offset, int length) throws IOException {
    return new String(encodedString, offset, length, encoding);
  }

  /**
   * Decode an array of bytes into a string.
   *
   * @param encodedString a byte array containing the string to decode
   * @return the decoded string
   * @throws IOException if something goes wrong
   */
  public String decode(byte[] encodedString) throws IOException {
    return decode(encodedString, 0, encodedString.length);
  }

  /**
   * Get a Reader that decodes the given InputStream using this encoding.
   *
   * @param in the underlying stream to decode from
   * @return a non-null Reader implementation.
   * @throws IOException if something goes wrong
   */
  public Reader getDecodingReader(InputStream in) throws IOException {
    return new InputStreamReader(in, encoding);
  }

  /**
   * Get a Writer that encodes to the given OutputStream using this encoding.
   *
   * @param out the underlying stream to encode to
   * @return a non-null Writer implementation.
   * @throws IOException if something goes wrong
   */
  public Writer getEncodingWriter(OutputStream out) throws IOException {
    return new OutputStreamWriter(out, encoding);
  }

  /**
   * Get an Encoding using the default encoding for the JVM.
   *
   * @return an Encoding instance
   */
  public static Encoding defaultEncoding() {
    return DEFAULT_ENCODING;
  }

  @Override
  public String toString() {
    return encoding.name();
  }

  /**
   * Checks whether this encoding is compatible with ASCII for the number characters '-' and
   * '0'..'9'. Where compatible means that they are encoded with exactly same values.
   *
   * @return If faster ASCII number parsing can be used with this encoding.
   */
  private static boolean testAsciiNumbers(Charset encoding) {
    // TODO: test all postgres supported encoding to see if there are
    // any which do _not_ have ascii numbers in same location
    // at least all the encoding listed in the encodings hashmap have
    // working ascii numbers
    try {
      String test = "-0123456789";
      byte[] bytes = test.getBytes(encoding);
      String res = new String(bytes, "US-ASCII");
      return test.equals(res);
    } catch (java.io.UnsupportedEncodingException e) {
      return false;
    }
  }
}