All Downloads are FREE. Search and download functionalities are using the official Maven repository.

groovy.util.CharsetToolkit Maven / Gradle / Ivy

There is a newer version: 1.5.8
Show newest version
/*
 * Copyright 2003-2007 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package groovy.util;

import java.io.*;
import java.nio.charset.Charset;
import java.util.Collection;

/**
 * 

Utility class to guess the encoding of a given text file.

* *

Unicode files encoded in UTF-16 (low or big endian) or UTF-8 files * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer * is wide enough, the charset should also be discovered.

* *

A byte buffer of 4KB is usually sufficient to be able to guess the encoding.

* *

Usage:

*
 * // guess the encoding
 * Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096);
 *
 * // create a reader with the correct charset
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = br.readLine())!= null)
 * {
 *     System.out.println(line);
 * }
 * 
* * @author Guillaume Laforge */ public class CharsetToolkit { private byte[] buffer; private Charset defaultCharset; private Charset charset; private boolean enforce8Bit = true; private final File file; private static final byte[] EMPTY_BYTE_ARRAY = new byte[0]; /** * Constructor of the CharsetToolkit utility class. * * @param file of which we want to know the encoding. */ public CharsetToolkit(File file) throws IOException { this.file = file; this.defaultCharset = getDefaultSystemCharset(); this.charset = null; InputStream input = new FileInputStream(file); try { byte[] bytes = new byte[4096]; int bytesRead = input.read(bytes); if (bytesRead == -1) { this.buffer = EMPTY_BYTE_ARRAY; } else if (bytesRead < 4096) { byte[] bytesToGuess = new byte[bytesRead]; System.arraycopy(bytes, 0, bytesToGuess, 0, bytesRead); this.buffer = bytesToGuess; } else { this.buffer = bytes; } } finally { try {input.close();} catch (IOException e){ // IGNORE } } } /** * Defines the default Charset used in case the buffer represents * an 8-bit Charset. * * @param defaultCharset the default Charset to be returned by guessEncoding() * if an 8-bit Charset is encountered. */ public void setDefaultCharset(Charset defaultCharset) { if (defaultCharset != null) this.defaultCharset = defaultCharset; else this.defaultCharset = getDefaultSystemCharset(); } public Charset getCharset() { if (this.charset == null) this.charset = guessEncoding(); return charset; } /** * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII. * It might be a file without any special character in the range 128-255, but that may be or become * a file encoded with the default charset rather than US-ASCII. * * @param enforce a boolean specifying the use or not of US-ASCII. */ public void setEnforce8Bit(boolean enforce) { this.enforce8Bit = enforce; } /** * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding. * * @return a boolean representing the flag of use of US-ASCII. */ public boolean getEnforce8Bit() { return this.enforce8Bit; } /** * Retrieves the default Charset */ public Charset getDefaultCharset() { return defaultCharset; } /** *

Guess the encoding of the provided buffer.

* If Byte Order Markers are encountered at the beginning of the buffer, we immidiately * return the charset implied by this BOM. Otherwise, the file would not be a human * readable text file.

* *

If there is no BOM, this method tries to discern whether the file is UTF-8 or not. * If it is not UTF-8, we assume the encoding is the default system encoding * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).

* *

It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.

*
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 
*

With UTF-8, 0xFE and 0xFF never appear.

* * @return the Charset recognized. */ private Charset guessEncoding() { // if the file has a Byte Order Marker, we can assume the file is in UTF-xx // otherwise, the file would not be human readable if (hasUTF8Bom()) return Charset.forName("UTF-8"); if (hasUTF16LEBom()) return Charset.forName("UTF-16LE"); if (hasUTF16BEBom()) return Charset.forName("UTF-16BE"); // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding // otherwise, the file is in US-ASCII boolean highOrderBit = false; // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid // if it's not the case, we can assume the encoding is the default encoding of the system boolean validU8Char = true; // TODO the buffer is not read up to the end, but up to length - 6 int length = buffer.length; int i = 0; while (i < length - 6) { byte b0 = buffer[i]; byte b1 = buffer[i + 1]; byte b2 = buffer[i + 2]; byte b3 = buffer[i + 3]; byte b4 = buffer[i + 4]; byte b5 = buffer[i + 5]; if (b0 < 0) { // a high order bit was encountered, thus the encoding is not US-ASCII // it may be either an 8-bit encoding or UTF-8 highOrderBit = true; // a two-bytes sequence was encoutered if (isTwoBytesSequence(b0)) { // there must be one continuation byte of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!isContinuationChar(b1)) validU8Char = false; else i++; } // a three-bytes sequence was encoutered else if (isThreeBytesSequence(b0)) { // there must be two continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2))) validU8Char = false; else i += 2; } // a four-bytes sequence was encoutered else if (isFourBytesSequence(b0)) { // there must be three continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3))) validU8Char = false; else i += 3; } // a five-bytes sequence was encoutered else if (isFiveBytesSequence(b0)) { // there must be four continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4))) validU8Char = false; else i += 4; } // a six-bytes sequence was encoutered else if (isSixBytesSequence(b0)) { // there must be five continuation bytes of the form 10xxxxxx, // otherwise the following characteris is not a valid UTF-8 construct if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3) && isContinuationChar(b4) && isContinuationChar(b5))) validU8Char = false; else i += 5; } else validU8Char = false; } if (!validU8Char) break; i++; } // if no byte with an high order bit set, the encoding is US-ASCII // (it might have been UTF-7, but this encoding is usually internally used only by mail systems) if (!highOrderBit) { // returns the default charset rather than US-ASCII if the enforce8Bit flag is set. if (this.enforce8Bit) return this.defaultCharset; else return Charset.forName("US-ASCII"); } // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8, // otherwise the file would not be human readable if (validU8Char) return Charset.forName("UTF-8"); // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding return this.defaultCharset; } /** * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character; * * @param b a byte. * @return true if it's a continuation char. */ private static boolean isContinuationChar(byte b) { return -128 <= b && b <= -65; } /** * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a two-bytes sequence. */ private static boolean isTwoBytesSequence(byte b) { return -64 <= b && b <= -33; } /** * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a three-bytes sequence. */ private static boolean isThreeBytesSequence(byte b) { return -32 <= b && b <= -17; } /** * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a four-bytes sequence. */ private static boolean isFourBytesSequence(byte b) { return -16 <= b && b <= -9; } /** * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a five-bytes sequence. */ private static boolean isFiveBytesSequence(byte b) { return -8 <= b && b <= -5; } /** * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character. * * @param b a byte. * @return true if it's the first byte of a six-bytes sequence. */ private static boolean isSixBytesSequence(byte b) { return -4 <= b && b <= -3; } /** * Retrieve the default charset of the system. * * @return the default Charset. */ public static Charset getDefaultSystemCharset() { return Charset.forName(System.getProperty("file.encoding")); } /** * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors). * * @return true if the buffer has a BOM for UTF8. */ public boolean hasUTF8Bom() { if (buffer.length >= 3) return (buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65); else return false; } /** * Has a Byte Order Marker for UTF-16 Low Endian * (ucs-2le, ucs-4le, and ucs-16le). * * @return true if the buffer has a BOM for UTF-16 Low Endian. */ public boolean hasUTF16LEBom() { if (buffer.length >= 2) return (buffer[0] == -1 && buffer[1] == -2); else return false; } /** * Has a Byte Order Marker for UTF-16 Big Endian * (utf-16 and ucs-2). * * @return true if the buffer has a BOM for UTF-16 Big Endian. */ public boolean hasUTF16BEBom() { if (buffer.length >= 2) return (buffer[0] == -2 && buffer[1] == -1); else return false; } /** * Gets a BufferedReader (indeed a LineNumberReader) from the File * specified in the constructor of CharsetToolkit using the charset discovered by the * method guessEncoding(). * * @return a BufferedReader * @throws FileNotFoundException if the file is not found. */ public BufferedReader getReader() throws FileNotFoundException { LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset())); if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) { try { reader.read(); } catch (IOException e) { // should never happen, as a file with no content // but with a BOM has at least one char } } return reader; } /** * Retrieves all the available Charsets on the platform, * among which the default charset. * * @return an array of Charsets. */ public static Charset[] getAvailableCharsets() { Collection collection = Charset.availableCharsets().values(); return (Charset[]) collection.toArray(new Charset[collection.size()]); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy