All Downloads are FREE. Search and download functionalities are using the official Maven repository.

au.id.jericho.lib.html.EncodedSource Maven / Gradle / Ivy

Go to download

Jericho HTML Parser is a java library allowing analysis and manipulation of parts of an HTML document, including server-side tags, while reproducing verbatim any unrecognised or invalid HTML.

There is a newer version: 3.4
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 2.3
// Copyright (C) 2006 Martin Jericho
// http://sourceforge.net/projects/jerichohtml/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/lesser.html
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package au.id.jericho.lib.html;

import java.util.*;
import java.io.*;
import java.net.*;

/**
 * Based on information in:
 * http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
 * http://www.w3.org/TR/html401/charset.html#h-5.2
 */
final class EncodedSource {
	public final Reader Reader;
	public final String Encoding;
	public final String EncodingSpecificationInfo;
	public final HttpURLConnection HttpURLConnection;
	
	private static final int PREVIEW_BUFFER_SIZE=2048;
	private static final int PREVIEW_MAX_BYTES=PREVIEW_BUFFER_SIZE*4; // Cater for each character in the preview buffer requiring an average of 4 bytes, which is twice what would reasonably be expected but ensures the reset() call on the BufferedInputStream doesn't fail.

	private static final String UTF_32="UTF-32"; // not supported in Java, will throw an exception.
	private static final String UTF_16="UTF-16";
	private static final String UTF_16BE="UTF-16BE";
	private static final String UTF_16LE="UTF-16LE";
	private static final String UTF_8="UTF-8";
	private static final String EBCDIC="Cp037";
	private static final String ISO_8859_1="ISO-8859-1";
	
	EncodedSource(final InputStream inputStream, final String encoding, final String encodingSpecificationInfo, final HttpURLConnection httpURLConnection) throws UnsupportedEncodingException {
		if (encoding==null)
			Reader=new InputStreamReader(inputStream); // Reader will be empty so the encoding is arbitrary.
		else
			Reader=new InputStreamReader(inputStream,encoding);
		Encoding=encoding;
		EncodingSpecificationInfo=encodingSpecificationInfo;
		HttpURLConnection=httpURLConnection;
	}

	public static EncodedSource construct(final URL url) throws IOException {
		final URLConnection urlConnection=url.openConnection();
		final HttpURLConnection httpURLConnection=(urlConnection instanceof HttpURLConnection) ? (HttpURLConnection)urlConnection : null;
		// urlConnection.setRequestProperty("Accept-Charset","UTF-8, ISO-8859-1;q=0"); // used for debugging
		final InputStream inputStream=urlConnection.getInputStream();
		final String contentType=urlConnection.getContentType();
		if (contentType!=null) {
			final String charset=Source.getCharsetParameterFromHttpHeaderValue(contentType);
			if (charset!=null) return new EncodedSource(inputStream,charset,"HTTP header Content-Type: "+contentType,httpURLConnection);
		}
		return construct(inputStream,httpURLConnection);
	}
	
	public static EncodedSource construct(final InputStream inputStream, final HttpURLConnection httpURLConnection) throws IOException {
		final BufferedInputStream in=(inputStream instanceof BufferedInputStream) ? (BufferedInputStream)inputStream : new BufferedInputStream(inputStream);
		in.mark(PREVIEW_MAX_BYTES);
		final String preliminaryEncoding=getPreliminaryEncoding(in);
		if (preliminaryEncoding==null) return new EncodedSource(in,null,"empty input stream",httpURLConnection);
		in.reset();
		final Source previewSource=getPreviewSource(in,preliminaryEncoding);
		in.reset();
		if (previewSource.getEncoding()!=null) return new EncodedSource(in,previewSource.encoding,previewSource.encodingSpecificationInfo,httpURLConnection);
		// No explicit encoding specified in document
		// If the document is not XML and is being loaded using HTTP, use the default specified by HTTP which is ISO-8859-1.
		// For the encoding to be ISO-8859-1, the preliminary encoding must be UTF-8.
		if (httpURLConnection!=null && preliminaryEncoding==UTF_8 && !previewSource.isXML())
			return new EncodedSource(in,ISO_8859_1,"HTTP default 8-bit encoding for non-XML document",httpURLConnection);
		// Just use the preliminary encoding (UTF-8 or UTF-16), which must be the case for an XML document without an XML declaration.
		return new EncodedSource(in,preliminaryEncoding,"XML default matching first four bytes of input stream",httpURLConnection);
	}

	private static String getPreliminaryEncoding(BufferedInputStream bufferedInputStream) throws IOException {
		final int b1=bufferedInputStream.read();
		if (b1==-1) return null;
		final int b2=bufferedInputStream.read();
		final int b3=bufferedInputStream.read();
		final int b4=bufferedInputStream.read();
		if ((b1&0xFE)==0xFE && b2==(b1^1)) { // first two bytes are FEFF or FFFE
			return (b3==0) ? UTF_32 : UTF_16;
		} else if (b1==0) {
			if (b2==0 || b4==0) return UTF_32;
			return UTF_16BE;
		} else if (b2==0) {
			return (b3==0) ? UTF_32 : UTF_16LE;
		} else if (b1==0x4C && b2==0x6F && b3==0xA7 && b4==0x94) return EBCDIC; // This only recognises "




© 2015 - 2025 Weber Informatics LLC | Privacy Policy