All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.jre.javaemul.internal.EmulatedCharset Maven / Gradle / Ivy

Go to download

Apache Commons Lang, a package of Java utility classes for the classes that are in java.lang's hierarchy, or are considered to be so standard as to justify existence in java.lang. This is a port for GWT, which enables program, to use Apache Commons Lang also in the frontend compiled by the gwt compiler to java-script. The code is tested using the latest revision of the JDK for supported LTS releases: 8, 11, 17 and 21 currently. See https://github.com/apache/commons-lang/blob/master/.github/workflows/maven.yml Please ensure your build environment is up-to-date and kindly report any build issues.

There is a newer version: 3.17.0-0
Show newest version
/*
 * Copyright 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package javaemul.internal;

import java.nio.charset.Charset;

/**
 * Provides Charset implementations.
 *
 * <p>Each instance converts between text and raw bytes through {@link #getBytes(String)} and
 * {@link #decodeString(byte[], int, int)}. Three shared instances are exposed as constants.
 */
public abstract class EmulatedCharset extends Charset {

  /** UTF-8 charset instance. */
  public static final EmulatedCharset UTF_8 = new UtfCharset("UTF-8");

  /** Single-byte Latin charset instance named {@code ISO-LATIN-1}. */
  public static final EmulatedCharset ISO_LATIN_1 = new LatinCharset("ISO-LATIN-1");

  /** Single-byte Latin charset instance named {@code ISO-8859-1}. */
  public static final EmulatedCharset ISO_8859_1 = new LatinCharset("ISO-8859-1");

  /**
   * One-byte-per-char charset: every char maps to the byte holding its low 8 bits and every
   * byte maps back to the char with the same unsigned value.
   */
  private static class LatinCharset extends EmulatedCharset {
    public LatinCharset(String name) {
      super(name);
    }

    /**
     * Encodes each char of {@code str} as a single byte.
     *
     * <p>Chars above U+00FF are not replaced; only their low 8 bits are kept.
     *
     * @param str text to encode
     * @return a new array with exactly {@code str.length()} bytes
     */
    @Override
    public byte[] getBytes(String str) {
      int n = str.length();
      byte[] bytes = new byte[n];
      for (int i = 0; i < n; ++i) {
        bytes[i] = (byte) (str.charAt(i) & 255);
      }
      return bytes;
    }

    /**
     * Decodes {@code len} bytes starting at {@code ofs}, one char per byte.
     *
     * @param bytes source bytes
     * @param ofs index of the first byte to decode
     * @param len number of bytes to decode
     * @return a new array with exactly {@code len} chars
     */
    @Override
    public char[] decodeString(byte[] bytes, int ofs, int len) {
      char[] chars = new char[len];
      for (int i = 0; i < len; ++i) {
        // Mask to undo sign extension so bytes 0x80-0xFF become chars U+0080-U+00FF.
        chars[i] = (char) (bytes[ofs + i] & 255);
      }
      return chars;
    }
  }

  /** UTF-8 charset supporting sequences of one to four bytes when decoding. */
  private static class UtfCharset extends EmulatedCharset {
    public UtfCharset(String name) {
      super(name);
    }

    /**
     * Decodes a UTF-8 byte range into UTF-16 chars.
     *
     * <p>Code points that need a surrogate pair produce two chars in the result.
     *
     * @param bytes source bytes
     * @param ofs index of the first byte to decode
     * @param len number of bytes to decode
     * @return a new array holding exactly the decoded chars
     * @throws IllegalArgumentException if a sequence starts with a continuation byte or an
     *     unsupported lead byte, or if a continuation byte is malformed
     * @throws IndexOutOfBoundsException if the last sequence is truncated
     */
    @Override
    public char[] decodeString(byte[] bytes, int ofs, int len) {
      // TODO(jat): consider using decodeURIComponent(escape(bytes)) instead

      // Pass 1: validate the lead bytes and compute an upper bound on the output length.
      int charCount = 0;
      for (int i = 0; i < len; ) {
        ++charCount;
        byte ch = bytes[ofs + i];
        if ((ch & 0xC0) == 0x80) {
          throw new IllegalArgumentException("Invalid UTF8 sequence");
        } else if ((ch & 0x80) == 0) {
          ++i;
        } else if ((ch & 0xE0) == 0xC0) {
          i += 2;
        } else if ((ch & 0xF0) == 0xE0) {
          i += 3;
        } else if ((ch & 0xF8) == 0xF0) {
          // A four-byte sequence normally encodes a supplementary code point, which
          // Character.toChars writes as a surrogate pair, so reserve a second slot.
          ++charCount;
          i += 4;
        } else {
          // no 5+ byte sequences since max codepoint is less than 2^21
          throw new IllegalArgumentException("Invalid UTF8 sequence");
        }
        if (i > len) {
          throw new IndexOutOfBoundsException("Invalid UTF8 sequence");
        }
      }

      // Pass 2: decode each sequence into the output buffer.
      char[] chars = new char[charCount];
      int outIdx = 0;
      for (int i = 0; i < len; ) {
        int ch = bytes[ofs + i++];
        int count;
        if ((ch & 0x80) == 0) {
          count = 1;
          ch &= 127;
        } else if ((ch & 0xE0) == 0xC0) {
          count = 2;
          ch &= 31;
        } else if ((ch & 0xF0) == 0xE0) {
          count = 3;
          ch &= 15;
        } else if ((ch & 0xF8) == 0xF0) {
          count = 4;
          ch &= 7;
        } else {
          // Unreachable: pass 1 already rejected every other lead byte.
          throw new IllegalArgumentException("Invalid UTF8 sequence");
        }
        while (--count > 0) {
          byte b = bytes[ofs + i++];
          if ((b & 0xC0) != 0x80) {
            // Mask so a negative byte prints as two hex digits rather than sign-extended.
            throw new IllegalArgumentException("Invalid UTF8 sequence at "
                + (ofs + i - 1) + ", byte=" + Integer.toHexString(b & 0xFF));
          }
          ch = (ch << 6) | (b & 63);
        }
        outIdx += Character.toChars(ch, chars, outIdx);
      }

      if (outIdx < chars.length) {
        // An overlong four-byte form decoded to a single char; drop the unused slot(s).
        char[] trimmed = new char[outIdx];
        System.arraycopy(chars, 0, trimmed, 0, outIdx);
        return trimmed;
      }
      return chars;
    }

    /**
     * Encodes a string as UTF-8, iterating by code point so that surrogate pairs are
     * encoded as one sequence.
     *
     * @param str text to encode
     * @return a new array holding the encoded bytes
     */
    @Override
    public byte[] getBytes(String str) {
      // TODO(jat): consider using unescape(encodeURIComponent(bytes)) instead
      int n = str.length();
      // First pass: compute the exact size of the encoded output.
      int byteCount = 0;
      for (int i = 0; i < n;) {
        int ch = str.codePointAt(i);
        i += Character.charCount(ch);
        if (ch < (1 << 7)) {
          byteCount++;
        } else if (ch < (1 << 11)) {
          byteCount += 2;
        } else if (ch < (1 << 16)) {
          byteCount += 3;
        } else if (ch < (1 << 21)) {
          byteCount += 4;
        } else if (ch < (1 << 26)) {
          byteCount += 5;
        }
      }
      // Second pass: write each code point into the pre-sized buffer.
      byte[] bytes = new byte[byteCount];
      int out = 0;
      for (int i = 0; i < n;) {
        int ch = str.codePointAt(i);
        i += Character.charCount(ch);
        out += encodeUtf8(bytes, out, ch);
      }
      return bytes;
    }

    /**
     * Encode a single character in UTF8.
     *
     * @param bytes byte array to store character in
     * @param ofs offset into byte array to store first byte
     * @param codePoint character to encode
     * @return number of bytes consumed by encoding the character
     * @throws IllegalArgumentException if codepoint >= 2^26
     */
    private int encodeUtf8(byte[] bytes, int ofs, int codePoint) {
      if (codePoint < (1 << 7)) {
        bytes[ofs] = (byte) (codePoint & 127);
        return 1;
      } else if (codePoint < (1 << 11)) {
        // 110xxxxx 10xxxxxx
        bytes[ofs++] = (byte) (((codePoint >> 6) & 31) | 0xC0);
        bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
        return 2;
      } else if (codePoint < (1 << 16)) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        bytes[ofs++] = (byte) (((codePoint >> 12) & 15) | 0xE0);
        bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80);
        bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
        return 3;
      } else if (codePoint < (1 << 21)) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        bytes[ofs++] = (byte) (((codePoint >> 18) & 7) | 0xF0);
        bytes[ofs++] = (byte) (((codePoint >> 12) & 63) | 0x80);
        bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80);
        bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
        return 4;
      } else if (codePoint < (1 << 26)) {
        // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
        bytes[ofs++] = (byte) (((codePoint >> 24) & 3) | 0xF8);
        bytes[ofs++] = (byte) (((codePoint >> 18) & 63) | 0x80);
        bytes[ofs++] = (byte) (((codePoint >> 12) & 63) | 0x80);
        bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80);
        bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
        return 5;
      }
      throw new IllegalArgumentException("Character out of range: " + codePoint);
    }
  }

  /**
   * Creates a charset with the given name, passing {@code null} as the superclass
   * constructor's second argument.
   *
   * @param name name handed to the {@link Charset} constructor
   */
  public EmulatedCharset(String name) {
    super(name, null);
  }

  /**
   * Encodes a string into bytes using this charset.
   *
   * @param string text to encode
   * @return a new array holding the encoded bytes
   */
  public abstract byte[] getBytes(String string);

  /**
   * Decodes a range of bytes into chars using this charset.
   *
   * @param bytes source bytes
   * @param ofs index of the first byte to decode
   * @param len number of bytes to decode
   * @return a new array holding the decoded chars
   */
  public abstract char[] decodeString(byte[] bytes, int ofs, int len);
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy