// org.apache.axiom.util.stax.dialect.EncodingDetectionHelper (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.axiom.util.stax.dialect;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import javax.xml.stream.XMLStreamException;
/**
 * Implements the character encoding autodetection algorithm described in Appendix F.1 of the
 * XML 1.0 specifications (Fifth Edition).
 * <p>
 * The helper peeks at the first four bytes of the supplied stream and then puts them back, so
 * that the stream returned by {@link #getInputStream()} still delivers the content starting at
 * its first byte. If the supplied stream supports mark/reset it is used directly; otherwise it
 * is wrapped in a {@link PushbackInputStream}. Callers must therefore continue reading from
 * {@link #getInputStream()}, not from the stream originally passed to the constructor.
 */
class EncodingDetectionHelper {
    /** Number of leading bytes inspected by the detection algorithm. */
    private static final int PREFIX_LENGTH = 4;

    /** The stream to peek into: either the original stream or a pushback wrapper around it. */
    private final InputStream stream;

    /** {@code true} if mark/reset is used to restore the stream, {@code false} for pushback. */
    private final boolean useMark;

    /**
     * Creates a helper for the given stream.
     *
     * @param stream the stream containing the XML document; wrapped in a
     *               {@link PushbackInputStream} if it does not support mark/reset
     */
    public EncodingDetectionHelper(InputStream stream) {
        useMark = stream.markSupported();
        if (useMark) {
            this.stream = stream;
        } else {
            this.stream = new PushbackInputStream(stream, PREFIX_LENGTH);
        }
    }

    /**
     * Returns the stream from which the document must be read.
     *
     * @return the original stream if it supports mark/reset, otherwise the pushback wrapper
     *         created by the constructor
     */
    public InputStream getInputStream() {
        return stream;
    }

    /**
     * Determines the character encoding from the first four bytes of the stream. The bytes are
     * restored afterwards, including when the stream turns out to be shorter than four bytes.
     * <p>
     * Only the byte patterns listed in the code below are recognized; any other start sequence
     * (including a UTF-8 byte order mark or an EBCDIC encoded XML declaration) yields
     * {@code "UTF-8"}.
     *
     * @return one of {@code "UCS-4"}, {@code "UTF-16BE"}, {@code "UTF-16LE"} or {@code "UTF-8"}
     * @throws XMLStreamException if the stream ends before four bytes could be read, or if an
     *         I/O error occurs while reading or restoring the bytes
     */
    public String detectEncoding() throws XMLStreamException {
        byte[] startBytes = new byte[PREFIX_LENGTH];
        try {
            if (useMark) {
                stream.mark(PREFIX_LENGTH);
            }
            int read = 0;
            // A single read() may return fewer bytes than requested, so loop until full.
            while (read < PREFIX_LENGTH) {
                int c = stream.read(startBytes, read, PREFIX_LENGTH - read);
                if (c == -1) {
                    // Put back whatever was consumed so the stream is left as we found it.
                    restore(startBytes, read);
                    throw new XMLStreamException("Unexpected end of stream");
                }
                read += c;
            }
            restore(startBytes, read);
        } catch (IOException ex) {
            throw new XMLStreamException("Unable to read start bytes", ex);
        }
        // Pack the four bytes big-endian into one int so they can be matched in a switch.
        int marker = ((startBytes[0] & 0xFF) << 24) + ((startBytes[1] & 0xFF) << 16)
                + ((startBytes[2] & 0xFF) << 8) + (startBytes[3] & 0xFF);
        switch (marker) {
            // Four-byte patterns: byte order mark or '<' (0x3C) in any of the four byte orders.
            case 0x0000FEFF:
            case 0xFFFE0000:
            case 0x0000FFFE:
            case 0xFEFF0000:
            case 0x0000003C:
            case 0x3C000000:
            case 0x00003C00:
            case 0x003C0000:
                return "UCS-4";
            // "<?" as two 16-bit units, high byte first.
            case 0x003C003F:
                return "UTF-16BE";
            // "<?" as two 16-bit units, low byte first.
            case 0x3C003F00:
                return "UTF-16LE";
            // "<?xm" as single bytes.
            case 0x3C3F786D:
                return "UTF-8";
            default:
                // Two-byte byte order mark followed by anything not matched above.
                if ((marker & 0xFFFF0000) == 0xFEFF0000) {
                    return "UTF-16BE";
                } else if ((marker & 0xFFFF0000) == 0xFFFE0000) {
                    return "UTF-16LE";
                } else {
                    return "UTF-8";
                }
        }
    }

    /**
     * Returns the given number of already consumed bytes to the stream, using reset or pushback
     * depending on how the stream was set up by the constructor.
     */
    private void restore(byte[] bytes, int count) throws IOException {
        if (useMark) {
            stream.reset();
        } else {
            ((PushbackInputStream) stream).unread(bytes, 0, count);
        }
    }
}