org.postgresql.core.Encoding Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mogdb-jdbc Show documentation
Java JDBC driver for MogDB
There is a newer version: 5.0.0.9.pg
/*
 * Copyright (c) 2003, PostgreSQL Global Development Group
 * See the LICENSE file in the project root for more information.
 */

package io.mogdb.core;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.HashMap;
import io.mogdb.log.Logger;
import io.mogdb.log.Log;



/**
 * Representation of a particular character encoding.
 */
public class Encoding {

  private static Log LOGGER = Logger.getLogger(Encoding.class.getName());

  private static final Encoding DEFAULT_ENCODING = new Encoding();
  private static final Encoding UTF8_ENCODING = new Encoding("UTF-8");

  /*
   * Preferred JVM encodings for backend encodings.
   */
  private static final HashMap encodings = new HashMap();

  static {
    //Note: this list should match the set of supported server
    // encodings found in backend/util/mb/encnames.c
    encodings.put("SQL_ASCII", new String[]{"ASCII", "US-ASCII"});
    encodings.put("UNICODE", new String[]{"UTF-8", "UTF8"});
    encodings.put("UTF8", new String[]{"UTF-8", "UTF8"});
    encodings.put("LATIN1", new String[]{"ISO8859_1"});
    encodings.put("LATIN2", new String[]{"ISO8859_2"});
    encodings.put("LATIN3", new String[]{"ISO8859_3"});
    encodings.put("LATIN4", new String[]{"ISO8859_4"});
    encodings.put("ISO_8859_5", new String[]{"ISO8859_5"});
    encodings.put("ISO_8859_6", new String[]{"ISO8859_6"});
    encodings.put("ISO_8859_7", new String[]{"ISO8859_7"});
    encodings.put("ISO_8859_8", new String[]{"ISO8859_8"});
    encodings.put("LATIN5", new String[]{"ISO8859_9"});
    encodings.put("LATIN7", new String[]{"ISO8859_13"});
    encodings.put("LATIN9", new String[]{"ISO8859_15_FDIS"});
    encodings.put("EUC_JP", new String[]{"EUC_JP"});
    encodings.put("EUC_CN", new String[]{"EUC_CN"});
    encodings.put("EUC_KR", new String[]{"EUC_KR"});
    encodings.put("JOHAB", new String[]{"Johab"});
    encodings.put("EUC_TW", new String[]{"EUC_TW"});
    encodings.put("SJIS", new String[]{"MS932", "SJIS"});
    encodings.put("BIG5", new String[]{"Big5", "MS950", "Cp950"});
    encodings.put("GBK", new String[]{"GBK", "MS936"});
    encodings.put("UHC", new String[]{"MS949", "Cp949", "Cp949C"});
    encodings.put("TCVN", new String[]{"Cp1258"});
    encodings.put("WIN1256", new String[]{"Cp1256"});
    encodings.put("WIN1250", new String[]{"Cp1250"});
    encodings.put("WIN874", new String[]{"MS874", "Cp874"});
    encodings.put("WIN", new String[]{"Cp1251"});
    encodings.put("ALT", new String[]{"Cp866"});
    // We prefer KOI8-U, since it is a superset of KOI8-R.
    encodings.put("KOI8", new String[]{"KOI8_U", "KOI8_R"});
    // If the database isn't encoding-aware then we can't have
    // any preferred encodings.
    encodings.put("UNKNOWN", new String[0]);
    // The following encodings do not have a java equivalent
    encodings.put("MULE_INTERNAL", new String[0]);
    encodings.put("LATIN6", new String[0]);
    encodings.put("LATIN8", new String[0]);
    encodings.put("LATIN10", new String[0]);
  }

  private final String encoding;
  private final boolean fastASCIINumbers;

  /**
   * Uses the default charset of the JVM.
   */
  private Encoding() {
    this(Charset.defaultCharset().name());
  }

  /**
   * Use the charset passed as parameter.
   *
   * @param encoding charset name to use
   */
  protected Encoding(String encoding) {
    if (encoding == null) {
      throw new NullPointerException("Null encoding charset not supported");
    }
    this.encoding = encoding;
    fastASCIINumbers = testAsciiNumbers();
  }

  /**
   * Returns true if this encoding has characters '-' and '0'..'9' in exactly same posision as
   * ascii.
   *
   * @return true if the bytes can be scanned directly for ascii numbers.
   */
  public boolean hasAsciiNumbers() {
    return fastASCIINumbers;
  }

  /**
   * Construct an Encoding for a given JVM encoding.
   *
   * @param jvmEncoding the name of the JVM encoding
   * @return an Encoding instance for the specified encoding, or an Encoding instance for the
   *     default JVM encoding if the specified encoding is unavailable.
   */
  public static Encoding getJVMEncoding(String jvmEncoding) {
    if ("UTF-8".equals(jvmEncoding)) {
      return new UTF8Encoding(jvmEncoding);
    }
    if (Charset.isSupported(jvmEncoding)) {
      return new Encoding(jvmEncoding);
    } else {
      return DEFAULT_ENCODING;
    }
  }

  /**
   * Construct an Encoding for a given database encoding.
   *
   * @param databaseEncoding the name of the database encoding
   * @return an Encoding instance for the specified encoding, or an Encoding instance for the
   *     default JVM encoding if the specified encoding is unavailable.
   */
  public static Encoding getDatabaseEncoding(String databaseEncoding) {
    if ("UTF8".equals(databaseEncoding)) {
      return UTF8_ENCODING;
    }
    // If the backend encoding is known and there is a suitable
    // encoding in the JVM we use that. Otherwise we fall back
    // to the default encoding of the JVM.
    String[] candidates = encodings.get(databaseEncoding);
    if (candidates != null) {
      for (String candidate : candidates) {
        LOGGER.trace("Search encoding candidate " + candidate);
        if (Charset.isSupported(candidate)) {
          return new Encoding(candidate);
        }
      }
    }

    // Try the encoding name directly -- maybe the charset has been
    // provided by the user.
    if (Charset.isSupported(databaseEncoding)) {
      return new Encoding(databaseEncoding);
    }

    // Fall back to default JVM encoding.
    LOGGER.trace(databaseEncoding + " encoding not found, returning default encoding");
    return DEFAULT_ENCODING;
  }

  /**
   * Get the name of the (JVM) encoding used.
   *
   * @return the JVM encoding name used by this instance.
   */
  public String name() {
    return Charset.isSupported(encoding) ? Charset.forName(encoding).name() : encoding;
  }

  /**
   * Encode a string to an array of bytes.
   *
   * @param s the string to encode
   * @return a bytearray containing the encoded string
   * @throws IOException if something goes wrong
   */
  public byte[] encode(String s) throws IOException {
    if (s == null) {
      return null;
    }

    return s.getBytes(encoding);
  }

  /**
   * Decode an array of bytes into a string.
   *
   * @param encodedString a byte array containing the string to decode
   * @param offset        the offset in encodedString of the first byte of the encoded
   *                      representation
   * @param length        the length, in bytes, of the encoded representation
   * @return the decoded string
   * @throws IOException if something goes wrong
   */
  public String decode(byte[] encodedString, int offset, int length) throws IOException {
    return new String(encodedString, offset, length, encoding);
  }

  /**
   * Decode an array of bytes into a string.
   *
   * @param encodedString a byte array containing the string to decode
   * @return the decoded string
   * @throws IOException if something goes wrong
   */
  public String decode(byte[] encodedString) throws IOException {
    return decode(encodedString, 0, encodedString.length);
  }

  /**
   * Get a Reader that decodes the given InputStream using this encoding.
   *
   * @param in the underlying stream to decode from
   * @return a non-null Reader implementation.
   * @throws IOException if something goes wrong
   */
  public Reader getDecodingReader(InputStream in) throws IOException {
    return new InputStreamReader(in, encoding);
  }

  /**
   * Get a Writer that encodes to the given OutputStream using this encoding.
   *
   * @param out the underlying stream to encode to
   * @return a non-null Writer implementation.
   * @throws IOException if something goes wrong
   */
  public Writer getEncodingWriter(OutputStream out) throws IOException {
    return new OutputStreamWriter(out, encoding);
  }

  /**
   * Get an Encoding using the default encoding for the JVM.
   *
   * @return an Encoding instance
   */
  public static Encoding defaultEncoding() {
    return DEFAULT_ENCODING;
  }

  public String toString() {
    return encoding;
  }

  /**
   * Checks weather this encoding is compatible with ASCII for the number characters '-' and
   * '0'..'9'. Where compatible means that they are encoded with exactly same values.
   *
   * @return If faster ASCII number parsing can be used with this encoding.
   */
  private boolean testAsciiNumbers() {
    // TODO: test all postgres supported encoding to see if there are
    // any which do _not_ have ascii numbers in same location
    // at least all the encoding listed in the encodings hashmap have
    // working ascii numbers
    try {
      String test = "-0123456789";
      byte[] bytes = encode(test);
      String res = new String(bytes, "US-ASCII");
      return test.equals(res);
    } catch (java.io.UnsupportedEncodingException e) {
      return false;
    } catch (IOException e) {
      return false;
    }
  }
}