net.htmlparser.jericho.StreamEncodingDetector Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jericho-html Show documentation
Show all versions of jericho-html Show documentation
Jericho HTML Parser is a Java library allowing analysis and manipulation of parts of an HTML document, including server-side tags, while reproducing verbatim any unrecognised or invalid HTML.
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.3
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
import java.io.*;
import java.nio.charset.*;
import java.net.*;
/**
* Based on information in:
* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
* http://www.w3.org/TR/html401/charset.html#h-5.2
*/
final class StreamEncodingDetector {
// ---- Instance state ----
// Stream the encoding is detected from; assigned exactly once by each constructor and returned by getInputStream().
private final InputStream inputStream;
// Name of the detected encoding; null until one has been determined (openReader() treats null as "empty stream").
private String encoding=null;
// Human-readable description of how the encoding was determined, e.g. the HTTP header or the byte pattern matched.
private String encodingSpecificationInfo=null;
// Exposed via isDifinitive(); starts out true.
private boolean definitive=true;
// Exposed via isDocumentSpecifiedEncodingPossible(); starts out true.
private boolean documentSpecifiedEncodingPossible=true;
// Exposed via isEndOfFile(); assigned by setEncoding(...).
private boolean endOfFile;
// Collects warnings raised while detecting the encoding; exposed via getLoggerQueue().
private final LoggerQueue logger=new LoggerQueue();
// ---- Encoding names ----
private static final String UTF_16="UTF-16";
private static final String UTF_16BE="UTF-16BE";
private static final String UTF_16LE="UTF-16LE";
private static final String UTF_8="UTF-8";
private static final String ISO_8859_1="ISO-8859-1";
private static final String EBCDIC="Cp037"; // aka IBM037, not guaranteed, but available on most platforms
private static final String WINDOWS_1252="Cp1252"; // aka Windows-1252, not guaranteed
// Fallback 8-bit encoding: Cp1252 when EncodingDetector reports it as supported, otherwise ISO-8859-1.
private static final String DEFAULT_8BIT=EncodingDetector.isEncodingSupported(WINDOWS_1252)?WINDOWS_1252:ISO_8859_1;
// All of the following encodings are generally not supported in java and will usually throw an exception if decoding is attempted.
// Specified explicitly using Byte Order Mark:
private static final String SCSU="SCSU";
private static final String UTF_7="UTF-7";
private static final String UTF_EBCDIC="UTF-EBCDIC";
private static final String BOCU_1="BOCU-1";
private static final String UTF_32="UTF-32";
// Guessed from presence of 00 bytes in first four bytes:
private static final String UTF_32BE="UTF-32BE";
private static final String UTF_32LE="UTF-32LE";
/**
 * Maps an encoding name (upper case) to the size in bits (8, 16 or 32) registered for it below.
 * Declared with explicit type arguments so that lookups need no cast and no unchecked warnings are produced.
 */
static final Map<String,Integer> codeUnitSizeMap=new HashMap<String,Integer>();
static {
	// 8-bit encodings
	codeUnitSizeMap.put(UTF_8,8);
	codeUnitSizeMap.put(ISO_8859_1,8);
	// The constant is mixed case ("Cp1252"); it is registered in upper case like every other key.
	codeUnitSizeMap.put(WINDOWS_1252.toUpperCase(),8);
	// 16-bit encodings
	codeUnitSizeMap.put(UTF_16,16);
	codeUnitSizeMap.put(UTF_16BE,16);
	codeUnitSizeMap.put(UTF_16LE,16);
	// 32-bit encodings
	codeUnitSizeMap.put(UTF_32,32);
	codeUnitSizeMap.put(UTF_32BE,32);
	codeUnitSizeMap.put(UTF_32LE,32);
}
/**
 * Constructs a detector for the content served by the given URL connection.
 * <p>
 * If the connection's Content-Type value carries a charset parameter naming a supported encoding,
 * that encoding is adopted, the connection's stream is used as-is and no bytes are examined.
 * In every other case detection is delegated to <code>init()</code>, operating on a stream that supports mark/reset
 * (the connection's stream is wrapped in a {@link BufferedInputStream} if necessary).
 *
 * @param urlConnection  the connection whose content encoding is to be determined.
 * @throws IOException  if an I/O error occurs while obtaining the connection's input stream or during detection.
 */
public StreamEncodingDetector(final URLConnection urlConnection) throws IOException {
	// urlConnection.setRequestProperty("Accept-Charset","UTF-8, ISO-8859-1;q=0"); // used for debugging
	final InputStream urlInputStream=urlConnection.getInputStream();
	final String contentType=urlConnection.getContentType();
	if (contentType!=null) {
		encoding=Source.getCharsetParameterFromHttpHeaderValue(contentType);
		if (encoding!=null && encoding.length()>0) {
			boolean encodingSupported=false;
			try {
				encodingSupported=Charset.isSupported(encoding);
			} catch (IllegalCharsetNameException ex) {
				// The name is syntactically illegal. The only case recovered from is a value wrapped in double quotes.
				if (encoding.charAt(0)=='"') {
					String encodingWithoutQuotes=encoding.replace("\"","");
					if (EncodingDetector.isEncodingSupported(encodingWithoutQuotes)) {
						logger.warn("Encoding "+encoding+" specified in HTTP header is illegaly delimited with double quotes, which have been ignored");
						encodingSupported=true;
					} else {
						logger.warn("Encoding "+encoding+" specified in HTTP header is illegaly delimited with double quotes");
					}
					// Keep the unquoted name regardless, so subsequent messages report it without the quotes.
					encoding=encodingWithoutQuotes;
				}
			}
			if (encodingSupported) {
				// The HTTP header is trusted: no need to inspect the stream.
				inputStream=urlInputStream;
				encodingSpecificationInfo="HTTP header Content-Type: "+contentType;
				return;
			}
			logger.warn("Encoding "+encoding+" specified in HTTP header is not supported, attempting other means of detection");
		}
	}
	// No usable encoding in the HTTP header: fall back to examining the stream itself.
	inputStream=urlInputStream.markSupported() ? urlInputStream : new BufferedInputStream(urlInputStream);
	init();
}
/**
 * Constructs a detector that examines the given stream.
 * <p>
 * A stream that does not support mark/reset is wrapped in a {@link BufferedInputStream};
 * detection is then carried out by <code>init()</code>.
 *
 * @param inputStream  the stream whose encoding is to be detected.
 * @throws IOException  if an I/O error occurs during detection.
 */
public StreamEncodingDetector(final InputStream inputStream) throws IOException {
	if (inputStream.markSupported()) {
		this.inputStream=inputStream;
	} else {
		this.inputStream=new BufferedInputStream(inputStream);
	}
	init();
}
/**
 * Returns the stream held by this detector, which may be a buffered wrapper around the one originally supplied.
 *
 * @return the stream assigned by the constructor.
 */
public InputStream getInputStream() {
	return this.inputStream;
}
/**
 * Returns the name of the encoding determined for the stream.
 *
 * @return the encoding name, or <code>null</code> if none has been set.
 */
public String getEncoding() {
	return this.encoding;
}
/**
 * Returns a textual description of how the encoding was determined.
 *
 * @return the description, or <code>null</code> if none has been set.
 */
public String getEncodingSpecificationInfo() {
	return this.encodingSpecificationInfo;
}
/**
 * Returns the value of the <code>definitive</code> flag.
 * (The method name is misspelt; it is kept as is because callers depend on it.)
 *
 * @return the current value of the <code>definitive</code> flag.
 */
public boolean isDifinitive() {
	return this.definitive;
}
/**
 * Returns the end-of-file flag recorded when the encoding was set.
 *
 * @return the current value of the <code>endOfFile</code> flag.
 */
public boolean isEndOfFile() {
	return this.endOfFile;
}
/**
 * Returns the value of the <code>documentSpecifiedEncodingPossible</code> flag.
 *
 * @return the current value of the flag.
 */
public boolean isDocumentSpecifiedEncodingPossible() {
	return this.documentSpecifiedEncodingPossible;
}
/**
 * Returns the queue holding the log messages produced by this detector.
 *
 * @return the logger queue of this detector.
 */
public LoggerQueue getLoggerQueue() {
	return this.logger;
}
/**
 * Opens a reader that decodes the underlying stream.
 * <p>
 * The detected encoding is used if there is one; if none was detected the default 8-bit encoding is used instead.
 *
 * @return a new reader over the stream held by this detector.
 * @throws UnsupportedEncodingException  if the detected encoding is not supported on this platform.
 */
public Reader openReader() throws UnsupportedEncodingException {
	final String charsetName;
	if (encoding==null) {
		// encoding==null only if input stream is empty so use an arbitrary encoding.
		charsetName=DEFAULT_8BIT;
	} else if (EncodingDetector.isEncodingSupported(encoding)) {
		charsetName=encoding;
	} else {
		throw new UnsupportedEncodingException(encoding+" - "+encodingSpecificationInfo);
	}
	return new InputStreamReader(inputStream,charsetName);
}
private boolean setEncoding(final String encoding, final String encodingSpecificationInfo, int skipChars, boolean endOfFile) throws IOException {
this.encoding=encoding;
this.encodingSpecificationInfo=encodingSpecificationInfo;
this.endOfFile=endOfFile;
for (int i=0; i=50% chance that encoding is UTF-16BE
}
// pattern X???
if (b4==0) {
// pattern X??0
if (b3==0) return setEncoding(UTF_32LE,"default 32-bit LE encoding (byte stream starts with pattern XX ?? 00 00)",0,false); // pattern X?00 most likely indicates UTF-32LE
// pattern X?X0
return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream stars with pattern XX ?? XX 00)",0,false); // Regardless of the second byte, assume the fourth 00 byte indicates UTF-16LE.
}
// pattern X??X
if (b2==0) {
// pattern X0?X
// Assuming the second 00 byte doesn't indicate a NUL character, and that it is very unlikely that this is a 32-bit encoding
// of a character outside of the BMP, we can assume that it indicates a 16-bit encoding.
// If the pattern is X00X, there is a 50/50 chance that the encoding is BE or LE, with one of the characters have a code that is a multiple of 0x100.
// This should be a very rare occurrence, and there is no more than a 50% chance that the encoding
// will be different to that assumed (UTF-16LE) without checking for this occurrence, so don't bother checking for it.
// If the pattern is X0XX, this is likely to indicate a 16-bit LE encoding with the second character > U+00FF.
return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream starts with pattern XX 00 ?? XX)",0,false);
}
// pattern XX?X
if (b3==0) return setEncoding(UTF_16BE,"default 16-bit BE encoding (byte stream starts with pattern XX XX 00 XX)",0,false); // pattern XX0X likely to indicate a 16-bit BE encoding with the first character > U+00FF.
// pattern XXXX
// Although it is still possible that this is a 16-bit encoding with the first two characters > U+00FF
// Assume the more likely case of four 8-bit characters <= U+00FF.
// Check whether it fits some common EBCDIC strings that might be found at the start of a document:
if (b1==0x4C) { // first character is EBCDIC '<' (ASCII 'L'), check a couple more characters before assuming EBCDIC encoding:
if (b2==0x6F && b3==0xA7 && b4==0x94) return setEncoding(EBCDIC,"default EBCDIC encoding ( detected)",0,false); // first four bytes are " detected)",0,false); // first four bytes are "= 0x80 indicate the presence of a multi-byte character, and there are many byte values that are illegal.
// Therefore, choose the only true 8-bit encoding that accepts all byte values and is guaranteed to be available on all java implementations.
return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (no 00 bytes present in first four bytes of stream)",0,false);
}
}