org.apache.commons.jre.javaemul.internal.EmulatedCharset Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of gwt-commons-codec Show documentation
The Apache Commons Codec component contains encoders and decoders for various formats such as Base16, Base32, Base64, digest, and Hexadecimal. In addition to these widely used encoders and decoders, the codec package also maintains a collection of phonetic encoding utilities. This is a port for GWT, which enables program, to use Apache Commons Codec also in the frontend compiled by the gwt compiler to java-script.
There is a newer version: 1.17.1-0
Show newest version
/*
 * Copyright 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package javaemul.internal;

import java.nio.charset.Charset;

/**
 * Provides Charset implementations.
 *
 * <p>Each instance converts between strings/chars and bytes through
 * {@link #getBytes(String)} and {@link #decodeString(byte[], int, int)}.
 */
public abstract class EmulatedCharset extends Charset {

  public static final EmulatedCharset UTF_8 = new UtfCharset("UTF-8");

  public static final EmulatedCharset ISO_LATIN_1 = new LatinCharset("ISO-LATIN-1");

  public static final EmulatedCharset ISO_8859_1 = new LatinCharset("ISO-8859-1");

  /**
   * Single-byte charset: every char corresponds to exactly one byte.
   */
  private static class LatinCharset extends EmulatedCharset {
    public LatinCharset(String name) {
      super(name);
    }

    /**
     * Encodes each char of {@code str} as one byte.
     *
     * <p>Chars above 255 are not replaced by a substitution character; only
     * their low 8 bits are kept.
     *
     * @param str string to encode
     * @return a new array with one byte per char of {@code str}
     */
    @Override
    public byte[] getBytes(String str) {
      int n = str.length();
      byte[] bytes = new byte[n];
      for (int i = 0; i < n; ++i) {
        bytes[i] = (byte) (str.charAt(i) & 255);
      }
      return bytes;
    }

    /**
     * Decodes {@code len} bytes starting at {@code ofs}, one char per byte.
     *
     * @param bytes source bytes
     * @param ofs index of the first byte to decode
     * @param len number of bytes to decode
     * @return a new array of {@code len} chars, each in the range 0..255
     */
    @Override
    public char[] decodeString(byte[] bytes, int ofs, int len) {
      char[] chars = new char[len];
      for (int i = 0; i < len; ++i) {
        // Mask so that bytes >= 0x80 are not sign-extended into the char.
        chars[i] = (char) (bytes[ofs + i] & 255);
      }
      return chars;
    }
  }

  /**
   * UTF-8 charset using sequences of one to four bytes per code point.
   */
  private static class UtfCharset extends EmulatedCharset {
    public UtfCharset(String name) {
      super(name);
    }

    /**
     * Decodes {@code len} bytes of UTF-8 starting at {@code ofs}.
     *
     * <p>The input is scanned twice: the first pass validates lead bytes and
     * sequence lengths and sizes the output, the second pass decodes.
     *
     * @param bytes source bytes
     * @param ofs index of the first byte to decode
     * @param len number of bytes to decode
     * @return the decoded chars; supplementary code points are returned as
     *         surrogate pairs
     * @throws IllegalArgumentException if a lead byte or continuation byte is
     *         invalid
     * @throws IndexOutOfBoundsException if the last sequence is truncated
     */
    @Override
    public char[] decodeString(byte[] bytes, int ofs, int len) {
      // TODO(jat): consider using decodeURIComponent(escape(bytes)) instead
      int maxChars = 0;
      for (int i = 0; i < len; ) {
        int seqLen = sequenceLength(bytes[ofs + i]);
        // A 4-byte sequence can decode to a supplementary code point, which
        // Character.toChars stores as two chars (a surrogate pair).
        maxChars += (seqLen == 4) ? 2 : 1;
        i += seqLen;
        if (i > len) {
          throw new IndexOutOfBoundsException("Invalid UTF8 sequence");
        }
      }

      char[] chars = new char[maxChars];
      int outIdx = 0;
      for (int i = 0; i < len; ) {
        byte lead = bytes[ofs + i++];
        // Cannot throw here: every lead byte was validated by the first pass.
        int seqLen = sequenceLength(lead);
        int codePoint;
        switch (seqLen) {
          case 1:
            codePoint = lead & 0x7F;
            break;
          case 2:
            codePoint = lead & 0x1F;
            break;
          case 3:
            codePoint = lead & 0x0F;
            break;
          default:
            codePoint = lead & 0x07;
            break;
        }
        for (int k = 1; k < seqLen; ++k) {
          byte b = bytes[ofs + i++];
          if ((b & 0xC0) != 0x80) {
            // Mask the byte so values >= 0x80 print as two hex digits rather
            // than as a sign-extended int.
            throw new IllegalArgumentException("Invalid UTF8 sequence at "
                + (ofs + i - 1) + ", byte=" + Integer.toHexString(b & 0xFF));
          }
          codePoint = (codePoint << 6) | (b & 63);
        }
        outIdx += Character.toChars(codePoint, chars, outIdx);
      }

      if (outIdx == chars.length) {
        return chars;
      }
      // A 4-byte sequence that decoded to a single char left a reserved slot
      // unused; return only the chars actually written.
      char[] trimmed = new char[outIdx];
      System.arraycopy(chars, 0, trimmed, 0, outIdx);
      return trimmed;
    }

    /**
     * Returns the total length in bytes of the sequence introduced by a lead
     * byte.
     *
     * @param lead first byte of a sequence
     * @return a value from 1 to 4
     * @throws IllegalArgumentException if {@code lead} is a continuation byte
     *         or introduces a sequence of 5 or more bytes
     */
    private static int sequenceLength(byte lead) {
      if ((lead & 0x80) == 0) {
        return 1;
      }
      if ((lead & 0xE0) == 0xC0) {
        return 2;
      }
      if ((lead & 0xF0) == 0xE0) {
        return 3;
      }
      if ((lead & 0xF8) == 0xF0) {
        return 4;
      }
      // Either a stray continuation byte (10xxxxxx) or a 5+ byte lead; there
      // are no 5+ byte sequences since max codepoint is less than 2^21.
      throw new IllegalArgumentException("Invalid UTF8 sequence");
    }

    /**
     * Encodes {@code str} as UTF-8.
     *
     * <p>The string is walked by code point, so a surrogate pair becomes one
     * 4-byte sequence.
     *
     * @param str string to encode
     * @return a new array holding the encoded bytes
     */
    @Override
    public byte[] getBytes(String str) {
      // TODO(jat): consider using unescape(encodeURIComponent(bytes)) instead
      int n = str.length();
      int byteCount = 0;
      for (int i = 0; i < n; ) {
        int ch = str.codePointAt(i);
        i += Character.charCount(ch);
        byteCount += encodedLength(ch);
      }
      byte[] bytes = new byte[byteCount];
      int out = 0;
      for (int i = 0; i < n; ) {
        int ch = str.codePointAt(i);
        i += Character.charCount(ch);
        out += encodeUtf8(bytes, out, ch);
      }
      return bytes;
    }

    /**
     * Returns the number of bytes {@link #encodeUtf8} writes for a code point.
     *
     * @param codePoint character to measure
     * @return a value from 1 to 4
     * @throws IllegalArgumentException if codepoint >= 2^21
     */
    private static int encodedLength(int codePoint) {
      if (codePoint < (1 << 7)) {
        return 1;
      }
      if (codePoint < (1 << 11)) {
        return 2;
      }
      if (codePoint < (1 << 16)) {
        return 3;
      }
      if (codePoint < (1 << 21)) {
        return 4;
      }
      throw new IllegalArgumentException("Character out of range: " + codePoint);
    }

    /**
     * Encode a single character in UTF8.
     *
     * @param bytes byte array to store character in
     * @param ofs offset into byte array to store first byte
     * @param codePoint character to encode
     * @return number of bytes consumed by encoding the character
     * @throws IllegalArgumentException if codepoint >= 2^21
     */
    private int encodeUtf8(byte[] bytes, int ofs, int codePoint) {
      int length = encodedLength(codePoint);
      switch (length) {
        case 1:
          bytes[ofs] = (byte) (codePoint & 127);
          break;
        case 2:
          // 110xxxxx 10xxxxxx
          bytes[ofs++] = (byte) (((codePoint >> 6) & 31) | 0xC0);
          bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
          break;
        case 3:
          // 1110xxxx 10xxxxxx 10xxxxxx
          bytes[ofs++] = (byte) (((codePoint >> 12) & 15) | 0xE0);
          bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80);
          bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
          break;
        default:
          // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          bytes[ofs++] = (byte) (((codePoint >> 18) & 7) | 0xF0);
          bytes[ofs++] = (byte) (((codePoint >> 12) & 63) | 0x80);
          bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80);
          bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
          break;
      }
      return length;
    }
  }

  /**
   * Creates a charset with the given name, passing {@code null} as the second
   * argument of the {@link Charset} constructor.
   *
   * @param name name of the charset
   */
  public EmulatedCharset(String name) {
    super(name, null);
  }

  /**
   * Encodes a string into bytes using this charset.
   *
   * @param string string to encode
   * @return the encoded bytes
   */
  public abstract byte[] getBytes(String string);

  /**
   * Decodes a range of bytes into chars using this charset.
   *
   * @param bytes source bytes
   * @param ofs index of the first byte to decode
   * @param len number of bytes to decode
   * @return the decoded chars
   */
  public abstract char[] decodeString(byte[] bytes, int ofs, int len);
}