All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.postgresql.core.Encoding Maven / Gradle / Ivy

There is a newer version: 42.7.5
Show newest version
/*
 * Copyright (c) 2003, PostgreSQL Global Development Group
 * See the LICENSE file in the project root for more information.
 */

package org.postgresql.core;

import org.checkerframework.checker.nullness.qual.PolyNull;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Representation of a particular character encoding.
 */
public class Encoding {

  private static final Logger LOGGER = Logger.getLogger(Encoding.class.getName());

  private static final Encoding DEFAULT_ENCODING = new Encoding();

  private static final Encoding UTF8_ENCODING = new Encoding(StandardCharsets.UTF_8, true);

  /*
   * Preferred JVM encodings for backend encodings.
   */
  private static final HashMap encodings = new HashMap();

  static {
    //Note: this list should match the set of supported server
    // encodings found in backend/util/mb/encnames.c
    encodings.put("SQL_ASCII", new String[]{"ASCII", "US-ASCII"});
    encodings.put("UNICODE", new String[]{"UTF-8", "UTF8"});
    encodings.put("UTF8", new String[]{"UTF-8", "UTF8"});
    encodings.put("LATIN1", new String[]{"ISO8859_1"});
    encodings.put("LATIN2", new String[]{"ISO8859_2"});
    encodings.put("LATIN3", new String[]{"ISO8859_3"});
    encodings.put("LATIN4", new String[]{"ISO8859_4"});
    encodings.put("ISO_8859_5", new String[]{"ISO8859_5"});
    encodings.put("ISO_8859_6", new String[]{"ISO8859_6"});
    encodings.put("ISO_8859_7", new String[]{"ISO8859_7"});
    encodings.put("ISO_8859_8", new String[]{"ISO8859_8"});
    encodings.put("LATIN5", new String[]{"ISO8859_9"});
    encodings.put("LATIN7", new String[]{"ISO8859_13"});
    encodings.put("LATIN9", new String[]{"ISO8859_15_FDIS"});
    encodings.put("EUC_JP", new String[]{"EUC_JP"});
    encodings.put("EUC_CN", new String[]{"EUC_CN"});
    encodings.put("EUC_KR", new String[]{"EUC_KR"});
    encodings.put("JOHAB", new String[]{"Johab"});
    encodings.put("EUC_TW", new String[]{"EUC_TW"});
    encodings.put("SJIS", new String[]{"MS932", "SJIS"});
    encodings.put("BIG5", new String[]{"Big5", "MS950", "Cp950"});
    encodings.put("GBK", new String[]{"GBK", "MS936"});
    encodings.put("UHC", new String[]{"MS949", "Cp949", "Cp949C"});
    encodings.put("TCVN", new String[]{"Cp1258"});
    encodings.put("WIN1256", new String[]{"Cp1256"});
    encodings.put("WIN1250", new String[]{"Cp1250"});
    encodings.put("WIN874", new String[]{"MS874", "Cp874"});
    encodings.put("WIN", new String[]{"Cp1251"});
    encodings.put("ALT", new String[]{"Cp866"});
    // We prefer KOI8-U, since it is a superset of KOI8-R.
    encodings.put("KOI8", new String[]{"KOI8_U", "KOI8_R"});
    // If the database isn't encoding-aware then we can't have
    // any preferred encodings.
    encodings.put("UNKNOWN", new String[0]);
    // The following encodings do not have a java equivalent
    encodings.put("MULE_INTERNAL", new String[0]);
    encodings.put("LATIN6", new String[0]);
    encodings.put("LATIN8", new String[0]);
    encodings.put("LATIN10", new String[0]);
  }

  static final AsciiStringInterner INTERNER = new AsciiStringInterner();

  private final Charset encoding;
  private final boolean fastASCIINumbers;

  /**
   * Uses the default charset of the JVM.
   */
  private Encoding() {
    this(Charset.defaultCharset());
  }

  /**
   * Subclasses may use this constructor if they know in advance of their ASCII number
   * compatibility.
   *
   * @param encoding charset to use
   * @param fastASCIINumbers whether this encoding is compatible with ASCII numbers.
   */
  protected Encoding(Charset encoding, boolean fastASCIINumbers) {
    if (encoding == null) {
      throw new NullPointerException("Null encoding charset not supported");
    }
    this.encoding = encoding;
    this.fastASCIINumbers = fastASCIINumbers;
    if (LOGGER.isLoggable(Level.FINEST)) {
      LOGGER.log(Level.FINEST, "Creating new Encoding {0} with fastASCIINumbers {1}",
          new Object[]{encoding, fastASCIINumbers});
    }
  }

  /**
   * Use the charset passed as parameter and tests at creation time whether the specified encoding
   * is compatible with ASCII numbers.
   *
   * @param encoding charset to use
   */
  protected Encoding(Charset encoding) {
    this(encoding, testAsciiNumbers(encoding));
  }

  /**
   * Returns true if this encoding has characters '-' and '0'..'9' in exactly same posision as
   * ascii.
   *
   * @return true if the bytes can be scanned directly for ascii numbers.
   */
  public boolean hasAsciiNumbers() {
    return fastASCIINumbers;
  }

  /**
   * Construct an Encoding for a given JVM encoding.
   *
   * @param jvmEncoding the name of the JVM encoding
   * @return an Encoding instance for the specified encoding, or an Encoding instance for the
   *     default JVM encoding if the specified encoding is unavailable.
   */
  public static Encoding getJVMEncoding(String jvmEncoding) {
    if ("UTF-8".equals(jvmEncoding)) {
      return UTF8_ENCODING;
    }
    if (Charset.isSupported(jvmEncoding)) {
      return new Encoding(Charset.forName(jvmEncoding));
    }
    return DEFAULT_ENCODING;
  }

  /**
   * Construct an Encoding for a given database encoding.
   *
   * @param databaseEncoding the name of the database encoding
   * @return an Encoding instance for the specified encoding, or an Encoding instance for the
   *     default JVM encoding if the specified encoding is unavailable.
   */
  public static Encoding getDatabaseEncoding(String databaseEncoding) {
    if ("UTF8".equals(databaseEncoding) || "UNICODE".equals(databaseEncoding)) {
      return UTF8_ENCODING;
    }
    // If the backend encoding is known and there is a suitable
    // encoding in the JVM we use that. Otherwise we fall back
    // to the default encoding of the JVM.
    String[] candidates = encodings.get(databaseEncoding);
    if (candidates != null) {
      for (String candidate : candidates) {
        LOGGER.log(Level.FINEST, "Search encoding candidate {0}", candidate);
        if (Charset.isSupported(candidate)) {
          return new Encoding(Charset.forName(candidate));
        }
      }
    }

    // Try the encoding name directly -- maybe the charset has been
    // provided by the user.
    if (Charset.isSupported(databaseEncoding)) {
      return new Encoding(Charset.forName(databaseEncoding));
    }

    // Fall back to default JVM encoding.
    LOGGER.log(Level.FINEST, "{0} encoding not found, returning default encoding", databaseEncoding);
    return DEFAULT_ENCODING;
  }

  /**
   * Indicates that string should be staged as a canonicalized value.
   *
   * 

* This is intended for use with {@code String} constants. *

* * @param string The string to maintain canonicalized reference to. Must not be {@code null}. * @see Encoding#decodeCanonicalized(byte[], int, int) */ public static void canonicalize(String string) { INTERNER.putString(string); } /** * Get the name of the (JVM) encoding used. * * @return the JVM encoding name used by this instance. */ public String name() { return encoding.name(); } /** * Encode a string to an array of bytes. * * @param s the string to encode * @return a bytearray containing the encoded string * @throws IOException if something goes wrong */ public byte @PolyNull [] encode(@PolyNull String s) throws IOException { if (s == null) { return null; } return s.getBytes(encoding); } /** * Decode an array of bytes possibly into a canonicalized string. * *

* Only ascii compatible encoding support canonicalization and only ascii {@code String} values are eligible * to be canonicalized. *

* * @param encodedString a byte array containing the string to decode * @param offset the offset in encodedString of the first byte of the encoded * representation * @param length the length, in bytes, of the encoded representation * @return the decoded string * @throws IOException if something goes wrong */ public String decodeCanonicalized(byte[] encodedString, int offset, int length) throws IOException { if (length == 0) { return ""; } // if fastASCIINumbers is false, then no chance of the byte[] being ascii compatible characters return fastASCIINumbers ? INTERNER.getString(encodedString, offset, length, this) : decode(encodedString, offset, length); } public String decodeCanonicalizedIfPresent(byte[] encodedString, int offset, int length) throws IOException { if (length == 0) { return ""; } // if fastASCIINumbers is false, then no chance of the byte[] being ascii compatible characters return fastASCIINumbers ? INTERNER.getStringIfPresent(encodedString, offset, length, this) : decode(encodedString, offset, length); } /** * Decode an array of bytes possibly into a canonicalized string. * *

* Only ascii compatible encoding support canonicalization and only ascii {@code String} values are eligible * to be canonicalized. *

* * @param encodedString a byte array containing the string to decode * @return the decoded string * @throws IOException if something goes wrong */ public String decodeCanonicalized(byte[] encodedString) throws IOException { return decodeCanonicalized(encodedString, 0, encodedString.length); } /** * Decode an array of bytes into a string. * * @param encodedString a byte array containing the string to decode * @param offset the offset in encodedString of the first byte of the encoded * representation * @param length the length, in bytes, of the encoded representation * @return the decoded string * @throws IOException if something goes wrong */ public String decode(byte[] encodedString, int offset, int length) throws IOException { return new String(encodedString, offset, length, encoding); } /** * Decode an array of bytes into a string. * * @param encodedString a byte array containing the string to decode * @return the decoded string * @throws IOException if something goes wrong */ public String decode(byte[] encodedString) throws IOException { return decode(encodedString, 0, encodedString.length); } /** * Get a Reader that decodes the given InputStream using this encoding. * * @param in the underlying stream to decode from * @return a non-null Reader implementation. * @throws IOException if something goes wrong */ public Reader getDecodingReader(InputStream in) throws IOException { return new InputStreamReader(in, encoding); } /** * Get a Writer that encodes to the given OutputStream using this encoding. * * @param out the underlying stream to encode to * @return a non-null Writer implementation. * @throws IOException if something goes wrong */ public Writer getEncodingWriter(OutputStream out) throws IOException { return new OutputStreamWriter(out, encoding); } /** * Get an Encoding using the default encoding for the JVM. * * @return an Encoding instance */ public static Encoding defaultEncoding() { return DEFAULT_ENCODING; } @Override public String toString() { return encoding.name(); } /** * Checks whether this encoding is compatible with ASCII for the number characters '-' and * '0'..'9'. Where compatible means that they are encoded with exactly same values. * * @return If faster ASCII number parsing can be used with this encoding. */ private static boolean testAsciiNumbers(Charset encoding) { // TODO: test all postgres supported encoding to see if there are // any which do _not_ have ascii numbers in same location // at least all the encoding listed in the encodings hashmap have // working ascii numbers String test = "-0123456789"; byte[] bytes = test.getBytes(encoding); String res = new String(bytes, StandardCharsets.US_ASCII); return test.equals(res); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy