org.archive.resource.html.HTMLResourceFactory Maven / Gradle / Ivy
package org.archive.resource.html;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.format.http.HttpHeaders;
import org.archive.format.json.JSONUtils;
import org.archive.format.text.charset.CharsetDetector;
import org.archive.format.text.charset.StandardCharsetDetector;
import org.archive.format.text.html.CDATALexer;
import org.archive.format.text.html.LexParser;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.resource.ResourceContainer;
import org.archive.resource.ResourceFactory;
import org.archive.resource.ResourceParseException;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
import org.json.JSONException;
import org.json.JSONObject;
public class HTMLResourceFactory implements ResourceFactory {
public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class);
protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;
protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";
protected CharsetDetector charSetDetector = new StandardCharsetDetector();
public Resource getResource(InputStream is, MetaData parentMetaData,
ResourceContainer container) throws ResourceParseException, IOException {
HTMLMetaData hmd = new HTMLMetaData(parentMetaData);
ExtractingParseObserver epo = new ExtractingParseObserver(hmd);
LexParser parser = new LexParser(epo);
CDATALexer lex = new CDATALexer();
// guess charset based on HTTP header and sniffed content chunk
String charset = "UTF-8";
is = new BufferedInputStream(is, CHARSET_GUESS_CHUNK_SIZE);
byte[] chunk = new byte[CHARSET_GUESS_CHUNK_SIZE];
is.mark(0);
int chunkSize = is.read(chunk, 0, CHARSET_GUESS_CHUNK_SIZE);
is.reset();
if (chunkSize > 0) {
JSONObject headers = JSONUtils.extractObject(hmd.getTopMetaData(), HTTP_HEADER_PATH);
HttpHeaders httpHeaders = new HttpHeaders();
if (headers.has("Content-Type")) {
try {
httpHeaders.add("Content-Type", headers.getString("Content-Type"));
} catch (JSONException e) { }
}
try {
charset = charSetDetector.getCharset(chunk, chunkSize, httpHeaders);
} catch (Exception e) {
LOG.error("Failed to guess charset: " + e.getMessage());
}
}
Page page;
try {
page = new Page(is, charset);
lex.setPage(page);
parser.doParse(lex);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
throw new ResourceParseException(e);
} catch (ParserException e) {
e.printStackTrace();
throw new ResourceParseException(e);
} catch(OutOfMemoryError e) {
throw new ResourceParseException(null);
}
return new HTMLResource(hmd,container);
}
}