net.sf.mmm.util.io.api.EncodingUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mmm-util-io Show documentation
Utilities for input/output and streaming.
The newest version!
/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0
 * http://www.apache.org/licenses/LICENSE-2.0 */
package net.sf.mmm.util.io.api;

import java.io.InputStream;

/**
 * This is the interface for a collection of utility functions that help to deal with encodings. An
 * encoding defines a mapping of {@link Character}s of a {@link java.nio.charset.Charset} to {@link Byte}s and
 * vice versa.
 * <p>
 * Besides the factory method {@link #createUtfDetectionReader(InputStream, String)} this interface declares
 * {@code String} constants holding the names of common encodings.
 * <p>
 * NOTE(review): the remarks on the individual constants about JDK support and about the archive that
 * contains an encoding ({@code lib/rt.jar}, {@code lib/charsets.jar}) are kept as originally written; they
 * presumably describe an older JDK and may be outdated. Verify with
 * {@link java.nio.charset.Charset#isSupported(String)} before relying on them.
 *
 * @see net.sf.mmm.util.io.base.EncodingUtilImpl
 *
 * @author Joerg Hohwiller (hohwille at users.sourceforge.net)
 * @since 1.0.1
 */
public interface EncodingUtil {

  /**
   * The default encoding used by this JVM as fallback if no explicit encoding is specified.
   * <p>
   * The value is the system property {@code file.encoding}. It is read once when this interface is
   * initialized, so later changes of that property are not reflected by this constant.
   */
  String SYSTEM_DEFAULT_ENCODING = System.getProperty("file.encoding");

  /**
   * The encoding {@code US-ASCII} (American Standard Code for Information Interchange) also just called
   * {@code ASCII}.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_US_ASCII = "US-ASCII";

  /**
   * The encoding {@code UTF-8}. It is an 8-bit Unicode Transformation Format.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_UTF_8 = "UTF-8";

  /**
   * The encoding {@code UTF-16}. It is a 16-bit Unicode Transformation Format. The byte-order is determined
   * by an optional {@link ByteOrderMark}.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_UTF_16 = "UTF-16";

  /**
   * The encoding {@code UTF-16, little-endian}. It is a 16-bit Unicode Transformation Format.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_UTF_16_LE = "UTF-16LE";

  /**
   * The encoding {@code UTF-16, big-endian}. It is a 16-bit Unicode Transformation Format.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_UTF_16_BE = "UTF-16BE";

  /**
   * The encoding {@code UTF-32}. It is a 32-bit Unicode Transformation Format. The byte-order is determined
   * by an optional {@link ByteOrderMark}.
   * <p>
   * <b>ATTENTION:</b> UTF-32 is NOT yet supported by Java.
   */
  String ENCODING_UTF_32 = "UTF-32";

  /**
   * The encoding {@code UTF-32, little-endian}. It is a 32-bit Unicode Transformation Format.
   * <p>
   * <b>ATTENTION:</b> UTF-32 is NOT yet supported by Java.
   */
  String ENCODING_UTF_32_LE = "UTF-32LE";

  /**
   * The encoding {@code UTF-32, big-endian}. It is a 32-bit Unicode Transformation Format.
   * <p>
   * <b>ATTENTION:</b> UTF-32 is NOT yet supported by Java.
   */
  String ENCODING_UTF_32_BE = "UTF-32BE";

  /**
   * The encoding {@code ISO-8859-1} also called {@code Latin-1}. It is covering most Western European
   * languages.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_ISO_8859_1 = "ISO-8859-1";

  /**
   * The encoding {@code ISO-8859-2} also called {@code Latin-2}. It is covering the Central and Eastern
   * European languages that use the Latin alphabet.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_ISO_8859_2 = "ISO-8859-2";

  /**
   * The encoding {@code ISO-8859-3} also called {@code Latin-3}. It is covering the South European languages.
   * <p>
   * This is an extended encoding for Java contained in {@code lib/charsets.jar}.
   */
  String ENCODING_ISO_8859_3 = "ISO-8859-3";

  /**
   * The encoding {@code ISO-8859-4} also called {@code Latin-4}. It is covering the North European languages.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_ISO_8859_4 = "ISO-8859-4";

  /**
   * The encoding {@code ISO-8859-5}. It is covering mostly Slavic languages that use a Cyrillic alphabet.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_ISO_8859_5 = "ISO-8859-5";

  /**
   * The encoding {@code ISO-8859-6}. It is covering common Arabic language characters.
   * <p>
   * This is an extended encoding for Java contained in {@code lib/charsets.jar}.
   */
  String ENCODING_ISO_8859_6 = "ISO-8859-6";

  /**
   * The encoding {@code ISO-8859-7}. It is covering modern Greek.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_ISO_8859_7 = "ISO-8859-7";

  /**
   * The encoding {@code ISO-8859-8}. It is covering modern Hebrew (used in Israel).
   * <p>
   * This is an extended encoding for Java contained in {@code lib/charsets.jar}.
   */
  String ENCODING_ISO_8859_8 = "ISO-8859-8";

  /**
   * The encoding {@code ISO-8859-9} also called {@code Latin-5}. It is covering Turkish and Kurdish.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_ISO_8859_9 = "ISO-8859-9";

  /**
   * The encoding {@code ISO-8859-10} also called {@code Latin-6}. It is used for Nordic languages.
   * <p>
   * <b>ATTENTION:</b> This encoding is NOT supported by Java.
   */
  String ENCODING_ISO_8859_10 = "ISO-8859-10";

  /**
   * The encoding {@code ISO-8859-11}. The {@link java.nio.charset.Charset#name() canonical name} however is
   * {@code x-iso-8859-11}. It is covering common Thai language characters.
   */
  String ENCODING_ISO_8859_11 = "x-iso-8859-11";

  /**
   * The encoding {@code ISO-8859-12}. The work on this encoding for Devanagari was stopped so it does NOT
   * exist at all.
   *
   * @deprecated this encoding was never finished and does NOT exist, so the constant is of no practical use.
   */
  @Deprecated
  String ENCODING_ISO_8859_12 = "ISO-8859-12";

  /**
   * The encoding {@code ISO-8859-13} also called {@code Latin-7}. It is covering Baltic languages.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_ISO_8859_13 = "ISO-8859-13";

  /**
   * The encoding {@code ISO-8859-14} also called {@code Latin-8}. It is covering Celtic languages.
   * <p>
   * <b>ATTENTION:</b> This encoding is NOT supported by Java.
   */
  String ENCODING_ISO_8859_14 = "ISO-8859-14";

  /**
   * The encoding {@code ISO-8859-15} also called {@code Latin-9}. It is very similar to
   * {@link #ENCODING_ISO_8859_1 Latin-1} but adds the euro-sign and 7 other characters by replacing rarely
   * used ones.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_ISO_8859_15 = "ISO-8859-15";

  /**
   * The encoding {@code ISO-8859-16} also called {@code Latin-10}. It is covering South-Eastern European
   * languages and includes the euro-sign.
   * <p>
   * <b>ATTENTION:</b> This encoding is NOT supported by Java.
   */
  String ENCODING_ISO_8859_16 = "ISO-8859-16";

  /**
   * The encoding {@code KOI8-R}. It is covering Russian and Bulgarian. It is therefore related to
   * {@link #ENCODING_ISO_8859_5} and {@link #ENCODING_WINDOWS_1251}.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_KOI8_R = "KOI8-R";

  /**
   * The encoding {@code KOI8-U}. It is covering Ukrainian. It is related to {@link #ENCODING_KOI8_R},
   * {@link #ENCODING_ISO_8859_5} and {@link #ENCODING_WINDOWS_1251}.
   * <p>
   * <b>ATTENTION:</b> This encoding is NOT supported by Java.
   */
  String ENCODING_KOI8_U = "KOI8-U";

  /**
   * The encoding {@code CP437} also called {@code DOS-US}. It is used by MS-DOS and is based on
   * {@link #ENCODING_US_ASCII} but NOT completely compatible.
   */
  String ENCODING_CP_437 = "IBM437";

  /**
   * The encoding {@code CP737}. It is used by MS-DOS for Greek and is therefore related to
   * {@link #ENCODING_CP_869} and {@link #ENCODING_ISO_8859_7}.
   */
  String ENCODING_CP_737 = "x-IBM737";

  /**
   * The encoding {@code CP850}. It is used by MS-DOS for Western European languages and is therefore related
   * to {@link #ENCODING_ISO_8859_1}.
   */
  String ENCODING_CP_850 = "IBM850";

  /**
   * The encoding {@code CP852}. It is used by MS-DOS for Central European languages and is therefore related
   * to {@link #ENCODING_ISO_8859_2}.
   */
  String ENCODING_CP_852 = "IBM852";

  /**
   * The encoding {@code CP855}. It is used by MS-DOS for Cyrillic letters and is therefore related to
   * {@link #ENCODING_ISO_8859_5}.
   */
  String ENCODING_CP_855 = "IBM855";

  /**
   * The encoding {@code CP857}. It is used by MS-DOS for Turkish and is therefore related to
   * {@link #ENCODING_ISO_8859_9}.
   */
  String ENCODING_CP_857 = "IBM857";

  /**
   * The encoding {@code CP858}. It is used by MS-DOS for Western European languages and is like
   * {@link #ENCODING_CP_850} but replaces one character with the euro-sign. It is therefore related to
   * {@link #ENCODING_ISO_8859_15}.
   */
  String ENCODING_CP_858 = "IBM00858";

  /**
   * The encoding {@code CP860}. It is used by MS-DOS for Portuguese and is therefore related to
   * {@link #ENCODING_ISO_8859_1}.
   */
  String ENCODING_CP_860 = "IBM860";

  /**
   * The encoding {@code CP861}. It is used by MS-DOS for Nordic languages especially for Icelandic and is
   * therefore related to {@link #ENCODING_ISO_8859_10}.
   */
  String ENCODING_CP_861 = "IBM861";

  /**
   * The encoding {@code CP863}. It is used by MS-DOS for French and is therefore related to
   * {@link #ENCODING_ISO_8859_15}.
   */
  String ENCODING_CP_863 = "IBM863";

  /**
   * The encoding {@code CP865}. It is used by MS-DOS for Nordic languages except Icelandic for which
   * {@link #ENCODING_CP_861} is used. It is therefore related to {@link #ENCODING_ISO_8859_10}.
   */
  String ENCODING_CP_865 = "IBM865";

  /**
   * The encoding {@code CP866}. It is used by MS-DOS for Cyrillic letters and is therefore related to
   * {@link #ENCODING_CP_855} and {@link #ENCODING_ISO_8859_5}.
   */
  String ENCODING_CP_866 = "IBM866";

  /**
   * The encoding {@code CP869}. It is used by MS-DOS for Greek and is therefore related to
   * {@link #ENCODING_CP_737} and {@link #ENCODING_ISO_8859_7}.
   */
  String ENCODING_CP_869 = "IBM869";

  /**
   * The encoding {@code CP1250} also called {@code Windows-1250}. It is used by Microsoft Windows for Central
   * European languages and is similar to {@link #ENCODING_ISO_8859_2}.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_WINDOWS_1250 = "windows-1250";

  /**
   * The encoding {@code CP1251} also called {@code Windows-1251}. It is used by Microsoft Windows for
   * Cyrillic letters and is similar to {@link #ENCODING_ISO_8859_5}.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_WINDOWS_1251 = "windows-1251";

  /**
   * The encoding {@code CP1252} also called {@code Windows-1252}. It is used by Microsoft Windows for Western
   * European languages and is similar to {@link #ENCODING_ISO_8859_1}.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_WINDOWS_1252 = "windows-1252";

  /**
   * The encoding {@code CP1253} also called {@code Windows-1253}. It is used by Microsoft Windows for Greek
   * and is similar to {@link #ENCODING_ISO_8859_7}.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_WINDOWS_1253 = "windows-1253";

  /**
   * The encoding {@code CP1254} also called {@code Windows-1254}. It is used by Microsoft Windows for Turkish
   * and is similar to {@link #ENCODING_ISO_8859_9}.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_WINDOWS_1254 = "windows-1254";

  /**
   * The encoding {@code CP1255} also called {@code Windows-1255}. It is used by Microsoft Windows for Hebrew
   * and is similar to {@link #ENCODING_ISO_8859_8}.
   */
  String ENCODING_WINDOWS_1255 = "windows-1255";

  /**
   * The encoding {@code CP1256} also called {@code Windows-1256}. It is used by Microsoft Windows for Arabic
   * and is similar to {@link #ENCODING_ISO_8859_6}.
   */
  String ENCODING_WINDOWS_1256 = "windows-1256";

  /**
   * The encoding {@code CP1257} also called {@code Windows-1257}. It is used by Microsoft Windows for Baltic
   * languages and is similar to {@link #ENCODING_ISO_8859_13}.
   * <p>
   * This is a basic encoding for Java contained in {@code lib/rt.jar}.
   */
  String ENCODING_WINDOWS_1257 = "windows-1257";

  /**
   * The encoding {@code CP1258} also called {@code Windows-1258}. It is used by Microsoft Windows for
   * Vietnamese and is similar to {@link #ENCODING_WINDOWS_1252}.
   */
  String ENCODING_WINDOWS_1258 = "windows-1258";

  /**
   * This method creates a new {@link java.io.Reader} for the given {@code inputStream}. The
   * {@link EncodingDetectionReader} automatically detects UTF (Unicode Transformation Format) encodings. If
   * the data provided by {@code inputStream} is NOT in such encoding, it will use the given
   * {@code nonUtfEncoding} as fallback.
   * <p>
   * The {@link EncodingDetectionReader} will behave like {@link java.io.InputStreamReader} but with an
   * encoding that is automatically detected whilst reading. It will use a lookahead buffer to detect the
   * encoding. As long as no UTF characteristic was detected and only ASCII-characters ({@code <128}) are hit,
   * the encoding remains {@link #ENCODING_US_ASCII}. As soon as a UTF sequence was detected (e.g.
   * {@link #ENCODING_UTF_8} or {@link #ENCODING_UTF_16_BE}), the encoding switches to that encoding. If a
   * non-ASCII character is hit and no UTF encoding is detected, the {@link EncodingDetectionReader} switches
   * to the given {@code nonUtfEncoding}.
   *
   * @param inputStream is the {@link InputStream} to decode and read.
   * @param nonUtfEncoding is the encoding to use in case the data is NOT encoded in UTF (e.g.
   *        {@link #ENCODING_ISO_8859_15}). It is pointless to use a UTF-based encoding or
   *        {@link #ENCODING_US_ASCII} here.
   * @return a new {@link EncodingDetectionReader} that can be used to read the {@code inputStream}.
   */
  EncodingDetectionReader createUtfDetectionReader(InputStream inputStream, String nonUtfEncoding);

}