net.sf.mmm.util.io.base.EncodingUtilImpl Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mmm-util-io Show documentation
Show all versions of mmm-util-io Show documentation
Utilities for input/output and streaming.
The newest version!
/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0
* http://www.apache.org/licenses/LICENSE-2.0 */
package net.sf.mmm.util.io.base;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Locale;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import net.sf.mmm.util.component.base.AbstractComponent;
import net.sf.mmm.util.io.api.ByteOrderMark;
import net.sf.mmm.util.io.api.ByteProcessor;
import net.sf.mmm.util.io.api.EncodingDetectionReader;
import net.sf.mmm.util.io.api.EncodingUtil;
import net.sf.mmm.util.io.api.ProcessableByteArrayBuffer;
import net.sf.mmm.util.io.impl.BufferInputStream;
/**
* This is the implementation of the {@link EncodingUtil} interface.
*
* @see #getInstance()
*
* @author Joerg Hohwiller (hohwille at users.sourceforge.net)
* @since 1.0.1
*/
public class EncodingUtilImpl extends AbstractComponent implements EncodingUtil {
private static final Logger LOG = LoggerFactory.getLogger(EncodingUtilImpl.class);
/**
* In an UTF-8 multi-byte-sequence all bytes except the first one have the from {@code 10xxxxxx}. This is the lower
* bound to detect such char.
*/
public static final byte UTF_8_CONTINUATION_BYTE_MIN = (byte) 0x80;
/**
* In an UTF-8 multi-byte-sequence all bytes except the first one have the from {@code 10xxxxxx}. This is the upper
* bound to detect such char.
*/
public static final byte UTF_8_CONTINUATION_BYTE_MAX = (byte) 0xBF;
/**
* An UTF-8 two-byte-sequence has the form {@code 110xxxxx 10xxxxxx}. This is the lower bound to detect the first char
* of such sequence.
* ATTENTION:
* The bytes {@code 0xC0} or {@code 0xC1} would indicate a two-byte-sequence with code-point <= 127 what makes no
* sense.
*/
public static final byte UTF_8_TWO_BYTE_MIN = (byte) 0xC2;
/**
* An UTF-8 two-byte-sequence has the form {@code 110xxxxx 10xxxxxx}. This is the upper bound to detect the first char
* of such sequence.
*/
public static final byte UTF_8_TWO_BYTE_MAX = (byte) 0xDF;
/**
* An UTF-8 thee-byte-sequence has the form {@code 1110xxxx 10xxxxxx 10xxxxxx}. This is the lower bound to detect the
* first char of such sequence.
*/
public static final byte UTF_8_THREE_BYTE_MIN = (byte) 0xE0;
/**
* An UTF-8 thee-byte-sequence has the form {@code 1110xxxx 10xxxxxx 10xxxxxx}. This is the upper bound to detect the
* first char of such sequence.
*/
public static final byte UTF_8_THREE_BYTE_MAX = (byte) 0xEF;
/**
* An UTF-8 four-byte-sequence has the form {@code 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx}. This is the lower bound to
* detect the first char of such sequence.
*/
public static final byte UTF_8_FOUR_BYTE_MIN = (byte) 0xF0;
/**
* An UTF-8 four-byte-sequence has the form {@code 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx}. This is the upper bound to
* detect the first char of such sequence.
* ATTENTION:
* The bytes {@code 0xF5}, {@code 0xF6}, or {@code 0xF7} would lead to a four-byte-sequence with code-point greater
* than {@code 10FFFF} which is restricted by rfc3629.
*/
public static final byte UTF_8_FOUR_BYTE_MAX = (byte) 0xF4;
/**
* An UTF-16 four-byte-sequence consists of 2 two-byte-sequences called surrogate. The first has the form
* {@code 110110xx xxxxxxxx}. This is the lower bound to detect the first char of such sequence.
*/
public static final byte UTF_16_FIRST_SURROGATE_MIN = (byte) 0xD8;
/**
* An UTF-16 four-byte-sequence consists of 2 two-byte-sequences called surrogate. The first has the form
* {@code 110110xx xxxxxxxx}. This is the upper bound to detect the first char of such sequence.
*/
public static final byte UTF_16_FIRST_SURROGATE_MAX = (byte) 0xDB;
/**
* An UTF-16 four-byte-sequence consists of 2 two-byte-sequences called surrogate. The second has the form
* {@code 110111xx xxxxxxxx}. This is the lower bound to detect the first char of such sequence.
*/
public static final byte UTF_16_SECOND_SURROGATE_MIN = (byte) 0xDC;
/**
* An UTF-16 four-byte-sequence consists of 2 two-byte-sequences called surrogate. The second has the form
* {@code 110111xx xxxxxxxx}. This is the upper bound to detect the first char of such sequence.
*/
public static final byte UTF_16_SECOND_SURROGATE_MAX = (byte) 0xDF;
/** The rank gain if a proper {@link ByteOrderMark} was detected. */
private static final int RANK_BOM = 20;
/** The rank gain if a proper UTF-8 multi-byte sequence was detected. */
private static final int RANK_UTF8_SEQUNCE = 10;
/** The rank gain if an UTF-16 surrogate pair was detected. */
private static final int RANK_UTF16_SURROGATE = 6;
private static EncodingUtil instance;
/**
* The constructor.
*/
public EncodingUtilImpl() {
super();
}
/**
* This method gets the singleton instance of this {@link EncodingUtilImpl}.
* ATTENTION:
* Please prefer dependency-injection instead of using this method.
*
* @return the singleton instance.
*/
public static EncodingUtil getInstance() {
if (instance == null) {
synchronized (EncodingUtilImpl.class) {
if (instance == null) {
EncodingUtilImpl util = new EncodingUtilImpl();
util.initialize();
instance = util;
}
}
}
return instance;
}
@Override
public EncodingDetectionReader createUtfDetectionReader(InputStream inputStream, String nonUtfEncoding) {
String encoding = nonUtfEncoding;
if (encoding == null) {
encoding = Charset.defaultCharset().name();
String enc = encoding.toLowerCase(Locale.US);
if ((enc.startsWith("utf")) || (enc.endsWith("ascii"))) {
encoding = ENCODING_ISO_8859_1;
}
} else {
String enc = encoding.toLowerCase(Locale.US);
if ((enc.startsWith("utf")) || (enc.endsWith("ascii"))) {
LOG.info("using encoding '" + encoding + "' for 'nonUtfEncoding' does NOT really make sense.");
}
}
return new UtfDetectionReader(inputStream, encoding);
}
/**
* This enum contains represents the type of a {@link Surrogate} from an UTF-16 sequence.
*/
protected static enum Surrogate {
/** The first, most significant surrogate. Starts with byte {@code } */
FIRST,
/** The second, least significant surrogate. */
SECOND;
}
/**
* This inner class is used to process the byes from the underlying {@link InputStream} in ASCII mode. It is used as
* long as no other encoding has been detected.
*/
protected static class AsciiProcessor implements ByteProcessor {
/** The character-buffer to fill by the reader. Will be used in ASCII mode. */
private char[] charBuffer;
/** The current index in {@link #charBuffer}. */
private int charOffset;
/**
* The constructor.
*/
public AsciiProcessor() {
super();
this.charBuffer = null;
this.charOffset = 0;
}
@Override
public int process(byte[] buffer, int offset, int length) {
int len = offset + length;
for (int i = offset; i < len; i++) {
this.charBuffer[this.charOffset++] = (char) buffer[i];
}
return length;
}
}
/**
* This inner class is used to perform the actual UTF detection. It processes the bytes from the underlying
* {@link InputStream} from a lookahead buffer. It respects a {@link ByteOrderMark}, UTF-8 multi-byte-sequences,
* UTF-16 surrogates, zero-bytes for UTF-16 and UTF-32 ASCII overhead, etc.
*/
protected static class UtfDetectionProcessor implements ByteProcessor {
/** The {@link RankMap} for encoding detection. */
private RankMap encodingRankMap;
/**
* The {@link ByteOrderMark} or {@code null} if NOT present (or detection NOT started).
*/
private ByteOrderMark bom;
/** The encoding to use if encoding is neither UTF nor ASCII. */
private final String nonUtfEncoding;
/**
* {@code false} if the data can NOT be ASCII, {@code true} otherwise.
*/
private boolean maybeAscii;
/**
* {@code false} if the data can NOT be UTF-8, {@code true} otherwise.
*/
private boolean maybeUtf8;
/**
* {@code false} if the data can NOT be UTF-16, {@code true} otherwise.
*/
private boolean maybeUtf16;
/** The byte-position in the stream relative to the head. */
private long bytePosition;
/** The {@link #bytePosition} where the first non-ascii byte was detected. */
private long firstNonAsciiPosition;
/**
* The number of bytes that have been {@code 0} for each of the {@link #bytePosition positions} modulo 4.
*/
private int[] zeroByteCounts;
/**
* The last {@link Surrogate}s for each of the {@link #bytePosition positions} modulo 4.
*/
private Surrogate[] surrogates;
/**
* The expected number of UTF-8 continuation bytes to come or {@code 0} if no UTF-8 multi-byte-sequence is currently
* processed.
*/
private int utf8ContinuationByteCount;
/**
* The constructor.
*
* @param nonUtfEncoding is the encoding to use if encoding is neither UTF nor ASCII.
*/
public UtfDetectionProcessor(String nonUtfEncoding) {
super();
this.nonUtfEncoding = nonUtfEncoding;
this.zeroByteCounts = new int[4];
this.surrogates = new Surrogate[4];
this.encodingRankMap = new RankMap<>();
this.maybeAscii = true;
this.maybeUtf8 = true;
this.maybeUtf16 = true;
}
@Override
public int process(byte[] buffer, int offset, int length) {
int len = offset + length;
for (int i = offset; i < len; i++) {
byte b = buffer[i];
if (b < 0) {
// non ASCII character detected
if (this.maybeAscii) {
this.maybeAscii = false;
this.firstNonAsciiPosition = this.bytePosition;
this.encodingRankMap.setUnacceptable(ENCODING_US_ASCII);
}
if (this.bytePosition == 0) {
i = processBom(buffer, offset, i);
}
if (this.maybeUtf8) {
processUtf8Detection(b);
}
if (this.maybeUtf16) {
processUtf16Detection(b);
}
} else {
// potential ASCII character
if (this.utf8ContinuationByteCount > 0) {
this.utf8ContinuationByteCount = 0;
this.maybeUtf8 = false;
this.encodingRankMap.setUnacceptable(ENCODING_UTF_8);
}
if (b == 0) {
int modulo4 = (int) (this.bytePosition & 3);
this.zeroByteCounts[modulo4]++;
}
}
this.bytePosition++;
}
return length;
}
/**
* Heuristic analysis to detect UTF-16 indications.
*
* @param b is the single byte to process.
*/
private void processUtf16Detection(byte b) {
int modulo4 = (int) (this.bytePosition & 3);
this.surrogates[modulo4] = null;
if ((b >= UTF_16_FIRST_SURROGATE_MIN) && (b <= UTF_16_SECOND_SURROGATE_MAX)) {
if (b <= UTF_16_FIRST_SURROGATE_MAX) {
this.surrogates[modulo4] = Surrogate.FIRST;
} else {
this.surrogates[modulo4] = Surrogate.SECOND;
}
int last = (modulo4 - 2) & 3;
if (this.surrogates[last] != null) {
if (this.surrogates[last] == this.surrogates[modulo4]) {
// duplicate surrogate high-byte --> can NOT be any UTF-16*
this.maybeUtf16 = false;
this.encodingRankMap.setUnacceptable(ENCODING_UTF_16_LE);
this.encodingRankMap.setUnacceptable(ENCODING_UTF_16_BE);
} else {
if ((modulo4 & 1) == 0) {
this.encodingRankMap.addRank(ENCODING_UTF_16_BE, RANK_UTF16_SURROGATE);
} else {
this.encodingRankMap.addRank(ENCODING_UTF_16_LE, RANK_UTF16_SURROGATE);
}
}
}
}
}
/**
* Heuristic analysis to detect UTF-8 indications.
*
* @param b is the single byte to process.
*/
private void processUtf8Detection(byte b) {
if (this.utf8ContinuationByteCount > 0) {
if ((b >= UTF_8_CONTINUATION_BYTE_MIN) && (b <= UTF_8_CONTINUATION_BYTE_MAX)) {
this.utf8ContinuationByteCount--;
if (this.utf8ContinuationByteCount == 0) {
this.encodingRankMap.addRank(ENCODING_UTF_8, RANK_UTF8_SEQUNCE);
}
} else {
this.utf8ContinuationByteCount = 0;
this.maybeUtf8 = false;
this.encodingRankMap.setUnacceptable(ENCODING_UTF_8);
}
} else {
if ((b >= UTF_8_TWO_BYTE_MIN) && (b <= UTF_8_TWO_BYTE_MAX)) {
// 110xxxxx --> UTF-8 two-byte sequence?
this.utf8ContinuationByteCount = 1;
} else if ((b >= UTF_8_THREE_BYTE_MIN) && (b <= UTF_8_THREE_BYTE_MAX)) {
// 1110xxxx --> UTF-8 three-byte sequence?
this.utf8ContinuationByteCount = 2;
} else if ((b >= UTF_8_FOUR_BYTE_MIN) && (b <= UTF_8_FOUR_BYTE_MAX)) {
// 1110xxxx --> UTF-8 three-byte sequence?
this.utf8ContinuationByteCount = 3;
} else {
this.maybeUtf8 = false;
this.encodingRankMap.setUnacceptable(ENCODING_UTF_8);
}
}
}
/**
* Detects if a {@link ByteOrderMark} (BOM) is available as hint for the encoding.
*
* @param buffer is the buffer of the raw data.
* @param offset is the current offset
* @param i is the current index.
* @return the new index. Will be the same as {@code i} or greater if bytes (for detected BOM) have been consumed.
*/
private int processBom(byte[] buffer, int offset, int i) {
// first read - try to detect encoding by BOM...
int resultIndex = i;
this.bom = ByteOrderMark.detect(buffer, offset);
if (this.bom != null) {
String encoding = this.bom.getEncoding();
this.encodingRankMap.addRank(encoding, RANK_BOM);
switch (this.bom) {
case UTF_8:
this.maybeUtf16 = false;
break;
case UTF_16_BE:
this.maybeUtf8 = false;
this.encodingRankMap.setUnacceptable(ENCODING_UTF_16_LE);
break;
case UTF_16_LE:
this.maybeUtf8 = false;
this.encodingRankMap.setUnacceptable(ENCODING_UTF_16_BE);
break;
case UTF_32_BE:
this.maybeUtf8 = false;
this.maybeUtf16 = false;
this.encodingRankMap.setUnacceptable(ENCODING_UTF_32_LE);
break;
case UTF_32_LE:
this.maybeUtf8 = false;
this.maybeUtf16 = false;
this.encodingRankMap.setUnacceptable(ENCODING_UTF_32_BE);
break;
default :
// nothing to do...
}
int add = this.bom.getLength() - 1;
resultIndex = resultIndex + add;
this.bytePosition = add;
}
return resultIndex;
}
/**
* This method gets the encoding without taking high-bytes (non-ASCII) into account.
*
* @return the low-byte encoding or {@code null} if it looks like ASCII so far.
*/
public String getLowByteEncoding() {
int evenZeroCount = this.zeroByteCounts[0] + this.zeroByteCounts[2];
int oddZeroCount = this.zeroByteCounts[1] + this.zeroByteCounts[3];
int zeroCount = evenZeroCount + oddZeroCount;
if (zeroCount > 0) {
// will ASCII files contain zero bytes???
if (this.maybeUtf16) {
if (evenZeroCount == 0) {
return ENCODING_UTF_16_LE;
}
if (oddZeroCount == 0) {
return ENCODING_UTF_16_BE;
}
}
int highZeroCount = this.zeroByteCounts[0] + this.zeroByteCounts[1];
if (highZeroCount == 0) {
return ENCODING_UTF_32_LE;
}
int lowZeroCount = this.zeroByteCounts[2] + this.zeroByteCounts[3];
if (lowZeroCount == 0) {
return ENCODING_UTF_32_BE;
}
}
return null;
}
/**
* This method gets the detected encoding from the currently processed data.
*
* @return the detected encoding or {@code null} if the encoding has NOT yet been detected and it looks like ASCII
* so far.
*/
public String getEncoding() {
String encoding;
if (this.maybeAscii) {
encoding = getLowByteEncoding();
} else {
encoding = this.encodingRankMap.getBest();
if (encoding == null) {
encoding = this.nonUtfEncoding;
}
}
return encoding;
}
}
/**
* @see EncodingUtilImpl#createUtfDetectionReader(InputStream, String)
*/
protected class UtfDetectionReader extends EncodingDetectionReader {
/** The required lookahead size. */
private static final int REQUIRED_LOOKAHEAD = 1024;
/** The used buffer size. */
private static final int BUFFER_SIZE = REQUIRED_LOOKAHEAD * 2;
/** The input-stream to read. */
private final BufferInputStream inputStream;
/** The processor for ASCII-mode. */
private final AsciiProcessor asciiProcessor;
/** The processor to detect encoding by lookahead. */
private final UtfDetectionProcessor detectionProcessor;
/** The lookahead buffer used to detect encoding. */
private final ProcessableByteArrayBuffer detectionBuffer;
private String encoding;
/**
* The {@link Reader} to delegate to. Will be {@code null} until the first non ASCII-Character is detected.
*/
private Reader reader;
/** The number of ASCII bytes available to read from {@link #inputStream}. */
private int asciiBytesAvailable;
/** {@code true} if end of stream is reached. */
private boolean eos;
/**
* The constructor.
*
* @param inputStream is the {@link InputStream} to read.
* @param nonUtfEncoding is the encoding to use as fallback if non-ASCII characters are detected that are NOT
* encoded in UTF.
*/
public UtfDetectionReader(InputStream inputStream, String nonUtfEncoding) {
super();
this.inputStream = new BufferInputStream(inputStream, BUFFER_SIZE);
if (nonUtfEncoding == null) {
throw new NullPointerException();
}
this.asciiProcessor = new AsciiProcessor();
this.detectionProcessor = new UtfDetectionProcessor(nonUtfEncoding);
this.detectionBuffer = this.inputStream.createLookaheadBuffer();
}
@Override
public String getEncoding() {
if ((this.encoding == null) && this.eos) {
return ENCODING_US_ASCII;
}
return this.encoding;
}
@Override
public void close() throws IOException {
this.inputStream.close();
if (this.reader != null) {
this.reader.close();
}
}
@Override
public int read(char[] buffer, int offset, int length) throws IOException {
int offPlusLen = offset + length;
if ((offset < 0) || (length < 0) || (offPlusLen < 0) || (buffer.length < offPlusLen)) {
throw new IndexOutOfBoundsException();
} else if (length == 0) {
return 0;
}
int bytesRead;
if (this.reader == null) {
// prevent modifying parameters
int off = offset;
int lengthRest = length;
// start detection
while (lengthRest > 0) {
if (this.asciiBytesAvailable == 0) {
// here we either need to detect the encoding or determine some
// number of next bytes that are ensured to be ASCII...
// refill our buffer...
this.eos = this.inputStream.fill();
if (this.detectionBuffer.hasNext()) {
int lookahead = (int) this.detectionBuffer.process(this.detectionProcessor, Integer.MAX_VALUE);
if ((!this.eos) && (!this.detectionProcessor.maybeAscii)) {
int nonAsciiOffset = (int) (this.detectionProcessor.bytePosition - this.detectionProcessor.firstNonAsciiPosition);
if (nonAsciiOffset < REQUIRED_LOOKAHEAD) {
this.encoding = this.detectionProcessor.getLowByteEncoding();
if (this.encoding == null) {
// seems to be ASCII until some byte, but not enough
// lookahead to determine encoding after that byte -> empty
// ASCII bytes from buffer and refill via loop.
this.asciiBytesAvailable = lookahead - nonAsciiOffset - 1;
}
}
}
if (this.asciiBytesAvailable == 0) {
this.encoding = this.detectionProcessor.getEncoding();
if (this.encoding == null) {
// ASCII so far...
this.asciiBytesAvailable = lookahead;
}
}
} else {
assert (this.eos);
break;
}
}
if (this.encoding == null) {
assert (this.asciiBytesAvailable > 0);
this.asciiProcessor.charBuffer = buffer;
this.asciiProcessor.charOffset = off;
int asciiCount = this.asciiBytesAvailable;
if (asciiCount > lengthRest) {
asciiCount = lengthRest;
}
int asciiRead = (int) this.inputStream.process(this.asciiProcessor, asciiCount);
if (asciiRead == 0) {
break;
}
this.asciiBytesAvailable = this.asciiBytesAvailable - asciiRead;
lengthRest = lengthRest - asciiRead;
off = off + asciiRead;
} else {
if (LOG.isTraceEnabled()) {
LOG.trace("detected encoding '" + this.encoding + "'");
}
this.reader = new InputStreamReader(this.inputStream, this.encoding);
return this.reader.read(buffer, off, lengthRest);
}
}
bytesRead = length - lengthRest;
if (bytesRead == 0) {
assert (this.eos);
return -1;
}
} else {
bytesRead = this.reader.read(buffer, offset, length);
}
return bytesRead;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy