org.archive.resource.html.HTMLResourceFactory Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of webarchive-commons Show documentation
There is a newer version: 1.1.9
package org.archive.resource.html;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.format.http.HttpHeaders;
import org.archive.format.json.JSONUtils;
import org.archive.format.text.charset.CharsetDetector;
import org.archive.format.text.charset.StandardCharsetDetector;
import org.archive.format.text.html.CDATALexer;
import org.archive.format.text.html.LexParser;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.resource.ResourceContainer;
import org.archive.resource.ResourceFactory;
import org.archive.resource.ResourceParseException;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * {@link ResourceFactory} that runs an HTML payload through a {@link LexParser}
 * and returns an {@link HTMLResource} backed by a fresh {@link HTMLMetaData}.
 *
 * <p>Before parsing, the charset is guessed from the first
 * {@link #CHARSET_GUESS_CHUNK_SIZE} bytes of the stream together with the
 * {@code Content-Type} HTTP header found in the top-level metadata (when
 * present). {@code UTF-8} is used when no guess can be made.
 */
public class HTMLResourceFactory implements ResourceFactory {

	public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class);

	/** Number of leading payload bytes sniffed when guessing the charset. */
	protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;
	/** Dotted path passed to {@link JSONUtils#extractObject} to locate the HTTP response headers. */
	protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";

	/** Detector consulted with the sniffed chunk and the HTTP headers. */
	protected CharsetDetector charSetDetector = new StandardCharsetDetector();


	/**
	 * Parses the HTML read from {@code is} and returns the resulting resource.
	 *
	 * @param is             stream positioned at the start of the HTML payload;
	 *                       it is wrapped in a {@link BufferedInputStream} so the
	 *                       sniffed bytes can be re-read by the parser
	 * @param parentMetaData metadata the new {@link HTMLMetaData} is attached to
	 * @param container      container handed through to the returned resource
	 * @return a new {@link HTMLResource} built from the HTML metadata and {@code container}
	 * @throws ResourceParseException if the charset is unsupported, the parser
	 *                                fails, or the parser runs out of memory
	 * @throws IOException            if sniffing the leading chunk fails
	 */
	public Resource getResource(InputStream is, MetaData parentMetaData,
			ResourceContainer container) throws ResourceParseException, IOException {
		HTMLMetaData hmd = new HTMLMetaData(parentMetaData);
		ExtractingParseObserver epo = new ExtractingParseObserver(hmd);
		LexParser parser = new LexParser(epo);
		CDATALexer lex = new CDATALexer();

		// guess charset based on HTTP header and sniffed content chunk
		is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE);
		String charset = guessCharset(is, hmd);

		try {
			Page page = new Page(is, charset);
			lex.setPage(page);
			parser.doParse(lex);
		} catch (UnsupportedEncodingException e) {
			// The cause travels with the wrapper; no need to dump it to stderr as well.
			throw new ResourceParseException(e);
		} catch (ParserException e) {
			throw new ResourceParseException(e);
		} catch (OutOfMemoryError e) {
			// Report an oversized/pathological document as a parse failure of this
			// resource, but keep the original error reachable through the cause chain.
			throw new ResourceParseException(
					new RuntimeException("Out of memory while parsing HTML", e));
		}

		return new HTMLResource(hmd, container);
	}

	/**
	 * Guesses the charset from the leading bytes of {@code is} and the
	 * {@code Content-Type} header stored in the top-level metadata of {@code hmd}.
	 * The stream is reset to its original position before returning.
	 *
	 * @param is  stream supporting {@code mark}/{@code reset} with a buffer of at
	 *            least {@link #CHARSET_GUESS_CHUNK_SIZE} bytes
	 * @param hmd metadata whose top-level object is searched for HTTP headers
	 * @return the detector's result, or {@code "UTF-8"} if the stream yielded no
	 *         bytes or the detector threw
	 * @throws IOException if reading from or resetting the stream fails
	 */
	private String guessCharset(InputStream is, HTMLMetaData hmd) throws IOException {
		String charset = "UTF-8";
		byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE];

		// The read limit must cover every byte read before reset(); a limit of 0
		// leaves the validity of the mark to the stream implementation.
		is.mark(CHARSET_GUESS_CHUNK_SIZE);
		int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE);
		is.reset();
		if (chunkSize <= 0) {
			return charset;
		}

		HttpHeaders httpHeaders = new HttpHeaders();
		JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
		// headers may be missing (e.g. a payload without HTTP response metadata);
		// in that case detection relies on the sniffed chunk alone.
		if (headers != null && headers.has("Content-Type")) {
			try {
				httpHeaders.add("Content-Type", headers.getString("Content-Type"));
			} catch (JSONException e) {
				// Best effort: an unreadable header value only removes one hint.
				LOG.debug("Ignoring unreadable Content-Type header", e);
			}
		}

		try {
			charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
		} catch (Exception e) {
			LOG.error("Failed to guess charset: " + e.getMessage(), e);
		}
		return charset;
	}
}