net.sf.mmm.util.io.api.ByteOrderMark Maven / Gradle / Ivy
/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0
* http://www.apache.org/licenses/LICENSE-2.0 */
package net.sf.mmm.util.io.api;
/**
 * This type represents a Byte-Order-Mark (BOM) of a
 * Unicode-Transformation-Format (UTF).
 * <p>
 * Some BOMs are prefixes of others: the UTF-16LE BOM ({@code 0xff 0xfe}) is the beginning of the UTF-32LE BOM
 * ({@code 0xff 0xfe 0x00 0x00}). {@link #detect(byte[], int)} therefore returns the longest BOM that matches.
 *
 * @author Joerg Hohwiller (hohwille at users.sourceforge.net)
 * @since 1.0.1
 */
public enum ByteOrderMark {

  /**
   * The {@link ByteOrderMark} for {@link EncodingUtil#ENCODING_UTF_8 UTF-8}:
   * {@code 0xef 0xbb 0xbf}
   */
  UTF_8() {

    @Override
    public String getEncoding() {

      return EncodingUtil.ENCODING_UTF_8;
    }

    @Override
    public byte[] getBytes() {

      return MAGIC_BYTES_UTF8;
    }
  },

  /**
   * The {@link ByteOrderMark} for {@link EncodingUtil#ENCODING_UTF_16_BE UTF-16BE}:
   * {@code 0xfe 0xff}
   */
  UTF_16_BE() {

    @Override
    public String getEncoding() {

      return EncodingUtil.ENCODING_UTF_16_BE;
    }

    @Override
    public byte[] getBytes() {

      return MAGIC_BYTES_UTF16_BE;
    }
  },

  /**
   * The {@link ByteOrderMark} for {@link EncodingUtil#ENCODING_UTF_16_LE UTF-16LE}:
   * {@code 0xff 0xfe}
   */
  UTF_16_LE() {

    @Override
    public String getEncoding() {

      return EncodingUtil.ENCODING_UTF_16_LE;
    }

    @Override
    public byte[] getBytes() {

      return MAGIC_BYTES_UTF16_LE;
    }
  },

  /**
   * The {@link ByteOrderMark} for {@link EncodingUtil#ENCODING_UTF_32_BE UTF-32BE}:
   * {@code 0x00 0x00 0xfe 0xff}
   */
  UTF_32_BE() {

    @Override
    public String getEncoding() {

      return EncodingUtil.ENCODING_UTF_32_BE;
    }

    @Override
    public byte[] getBytes() {

      return MAGIC_BYTES_UTF32_BE;
    }
  },

  /**
   * The {@link ByteOrderMark} for {@link EncodingUtil#ENCODING_UTF_32_LE UTF-32LE}:
   * {@code 0xff 0xfe 0x00 0x00}
   */
  UTF_32_LE() {

    @Override
    public String getEncoding() {

      return EncodingUtil.ENCODING_UTF_32_LE;
    }

    @Override
    public byte[] getBytes() {

      return MAGIC_BYTES_UTF32_LE;
    }
  };

  private static final byte[] MAGIC_BYTES_UTF8 = new byte[] { (byte) 0xef, (byte) 0xbb, (byte) 0xbf };

  private static final byte[] MAGIC_BYTES_UTF16_BE = new byte[] { (byte) 0xfe, (byte) 0xff };

  private static final byte[] MAGIC_BYTES_UTF16_LE = new byte[] { (byte) 0xff, (byte) 0xfe };

  private static final byte[] MAGIC_BYTES_UTF32_BE = new byte[] { 0x00, 0x00, (byte) 0xfe, (byte) 0xff };

  private static final byte[] MAGIC_BYTES_UTF32_LE = new byte[] { (byte) 0xff, (byte) 0xfe, 0x00, 0x00 };

  /**
   * This method gets the encoding indicated by this {@link ByteOrderMark}.
   *
   * @return the encoding.
   */
  public abstract String getEncoding();

  /**
   * This method gets the number of bytes of this {@link ByteOrderMark}.
   *
   * @return the length.
   */
  public final int getLength() {

    return getBytes().length;
  }

  /**
   * This method detects if this {@link ByteOrderMark} is present in the given {@code bytes}.
   * NOTE:
   * A BOM may only occur at the head of your data (file, payload, etc.).
   * ATTENTION:
   * Please note that binary data may accidentally have header bytes that represent this {@link ByteOrderMark}. This
   * method can NOT know this and will return {@code true} even if the data is NOT encoded with the
   * {@link #getEncoding() according encoding}. Therefore you should only use this method for the header of textual
   * data. Also note that this method only compares the bytes of this single BOM: {@link #UTF_16_LE} is also reported
   * as present for data that starts with the BOM of {@link #UTF_32_LE}. Use {@link #detect(byte[], int)} to resolve
   * that ambiguity.
   *
   * @param bytes is the buffer with the bytes to check. Must not be {@code null}.
   * @param offset is the index of the first data-byte in {@code bytes}. Will typically be {@code 0}. A negative
   *        value is not supported and may cause an {@link ArrayIndexOutOfBoundsException}.
   * @return {@code true} if this {@link ByteOrderMark BOM} was detected in the given {@code bytes} starting at
   *         {@code offset}, {@code false} otherwise (including the case that fewer bytes than the
   *         {@link #getLength() length} of this BOM are available from {@code offset}).
   */
  public final boolean isPresent(byte[] bytes, int offset) {

    byte[] bom = getBytes();
    // Compare as long: "offset + bom.length" would overflow int for offsets close to Integer.MAX_VALUE and
    // wrongly pass the bounds check.
    if (((long) offset + bom.length) > bytes.length) {
      return false;
    }
    for (int i = 0; i < bom.length; i++) {
      if (bytes[offset + i] != bom[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * This method gets the bytes of this BOM. The returned array is the shared internal instance and must NOT be
   * modified.
   *
   * @return the magic bytes of this BOM.
   */
  protected abstract byte[] getBytes();

  /**
   * This method detects the {@link ByteOrderMark} that may be {@link #isPresent(byte[], int) present} in the given
   * {@code bytes} starting at {@code offset}. If more than one {@link ByteOrderMark} matches (the BOM of
   * {@link #UTF_16_LE} is a prefix of the BOM of {@link #UTF_32_LE}), the one with the greatest
   * {@link #getLength() length} is returned.
   * ATTENTION:
   * Please note that binary data may accidentally have header bytes that represent a {@link ByteOrderMark}. This
   * method can NOT know this and will return that {@link ByteOrderMark} even if the data is NOT encoded with the
   * {@link #getEncoding() according encoding}. Therefore you should only use this method for the header of textual
   * data.
   *
   * @param bytes is the buffer with the bytes to check. Must not be {@code null}.
   * @param offset is the index of the first data-byte in {@code bytes}. Will typically be {@code 0}.
   * @return the detected {@link ByteOrderMark} or {@code null} if the given {@code bytes} have no BOM.
   */
  public static ByteOrderMark detect(byte[] bytes, int offset) {

    ByteOrderMark detected = null;
    for (ByteOrderMark bom : values()) {
      if (bom.isPresent(bytes, offset)) {
        // Prefer the longest match so that UTF-32LE (ff fe 00 00) is not mistaken for UTF-16LE (ff fe).
        if ((detected == null) || (bom.getLength() > detected.getLength())) {
          detected = bom;
        }
      }
    }
    return detected;
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy