![JAR search and dependency download from the Maven repository](/logo.png)
net.segoia.util.data.encoding.UnicodeUtil Maven / Gradle / Ivy
The newest version!
/**
* commons - Various Java Utils
* Copyright (C) 2009 Adrian Cristian Ionescu - https://github.com/acionescu
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.segoia.util.data.encoding;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PushbackInputStream;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
public class UnicodeUtil {
public static byte[] convert(byte[] bytes, String encout) throws Exception {
// Workaround for bug that will not be fixed by SUN
// http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4508058
UnicodeInputStream uis = new UnicodeInputStream(new ByteArrayInputStream(bytes), "ASCII");
boolean unicodeOutputReqd = (getBOM(encout) != null) ? true : false;
String enc = uis.getEncoding();
String BOM = getBOM(enc); // get the BOM of the inputstream
if (BOM == null) {
// inputstream looks like ascii...
// create a BOM based on the outputstream
BOM = getBOM(encout);
}
uis.close();
ByteArrayOutputStream out = new ByteArrayOutputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(bytes,
uis.getBOMOffset(), bytes.length), enc));
Writer w = new BufferedWriter(new OutputStreamWriter(out, encout));
// dont write a BOM for ascii(out) as the OutputStreamWriter
// will not process it correctly.
if (BOM != null && unicodeOutputReqd) {
w.write(BOM);
}
char[] buffer = new char[4096];
int len;
while ((len = br.read(buffer)) != -1) {
w.write(buffer, 0, len);
}
br.close(); // Close the input.
w.close(); // Flush and close output.
return out.toByteArray();
}
public static String getBOM(String enc) throws UnsupportedEncodingException {
if ("UTF-8".equals(enc)) {
byte[] bom = new byte[3];
bom[0] = (byte) 0xEF;
bom[1] = (byte) 0xBB;
bom[2] = (byte) 0xBF;
return new String(bom, enc);
} else if ("UTF-16BE".equals(enc)) {
byte[] bom = new byte[2];
bom[0] = (byte) 0xFE;
bom[1] = (byte) 0xFF;
return new String(bom, enc);
} else if ("UTF-16LE".equals(enc)) {
byte[] bom = new byte[2];
bom[0] = (byte) 0xFF;
bom[1] = (byte) 0xFE;
return new String(bom, enc);
} else if ("UTF-32BE".equals(enc)) {
byte[] bom = new byte[4];
bom[0] = (byte) 0x00;
bom[1] = (byte) 0x00;
bom[2] = (byte) 0xFE;
bom[3] = (byte) 0xFF;
return new String(bom, enc);
} else if ("UTF-32LE".equals(enc)) {
byte[] bom = new byte[4];
bom[0] = (byte) 0x00;
bom[1] = (byte) 0x00;
bom[2] = (byte) 0xFF;
bom[3] = (byte) 0xFE;
return new String(bom, enc);
} else {
return null;
}
}
public static class UnicodeInputStream extends InputStream {
private PushbackInputStream internalIn;
private boolean isInited = false;
private int BOMOffset = -1;
private String defaultEnc;
private String encoding;
public static final int BOM_SIZE = 4;
public UnicodeInputStream(InputStream in, String defaultEnc) {
internalIn = new PushbackInputStream(in, BOM_SIZE);
this.defaultEnc = defaultEnc;
}
public String getDefaultEncoding() {
return defaultEnc;
}
public String getEncoding() {
if (!isInited) {
try {
init();
} catch (IOException ex) {
IllegalStateException ise = new IllegalStateException("Init method failed.");
ise.initCause(ise);
throw ise;
}
}
return encoding;
}
/**
* Read-ahead four bytes and check for BOM marks. Extra bytes are unread back to the stream, only BOM bytes are
* skipped.
*/
protected void init() throws IOException {
if (isInited)
return;
byte bom[] = new byte[BOM_SIZE];
int n, unread;
n = internalIn.read(bom, 0, bom.length);
if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE)
&& (bom[3] == (byte) 0xFF)) {
encoding = "UTF-32BE";
unread = n - 4;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00)
&& (bom[3] == (byte) 0x00)) {
encoding = "UTF-32LE";
unread = n - 4;
} else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
encoding = "UTF-8";
unread = n - 3;
} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
encoding = "UTF-16BE";
unread = n - 2;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
encoding = "UTF-16LE";
unread = n - 2;
} else {
// Unicode BOM mark not found, unread all bytes
encoding = defaultEnc;
unread = n;
}
BOMOffset = BOM_SIZE - unread;
if (unread > 0)
internalIn.unread(bom, (n - unread), unread);
isInited = true;
}
public void close() throws IOException {
// init();
isInited = true;
internalIn.close();
}
public int read() throws IOException {
// init();
isInited = true;
return internalIn.read();
}
public int getBOMOffset() {
return BOMOffset;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy