All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.resource.html.HTMLResourceFactory Maven / Gradle / Ivy

There is a newer version: 1.3.0
Show newest version
package org.archive.resource.html;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;

import org.archive.format.text.html.CDATALexer;
import org.archive.format.text.html.LexParser;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.resource.ResourceContainer;
import org.archive.resource.ResourceFactory;
import org.archive.resource.ResourceParseException;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;

/**
 * {@link ResourceFactory} that runs an HTML input stream through a
 * {@link LexParser} and wraps the resulting {@link HTMLMetaData} in an
 * {@link HTMLResource}.
 */
public class HTMLResourceFactory implements ResourceFactory {

	/**
	 * Charset used to decode every document.
	 * TODO: figure out the real charset instead of always assuming UTF-8.
	 */
	private static final String DEFAULT_CHARSET = "UTF-8";

	/**
	 * Parses the given stream as HTML and returns a resource carrying the
	 * metadata populated during the parse.
	 *
	 * @param is             stream holding the HTML document; it is read but
	 *                       not closed by this method
	 * @param parentMetaData metadata object the new {@link HTMLMetaData} is
	 *                       created from
	 * @param container      container handed to the returned
	 *                       {@link HTMLResource}
	 * @return a new {@link HTMLResource} built from the populated metadata
	 *         and {@code container}
	 * @throws ResourceParseException if the charset is unsupported, the
	 *                                parser reports an error, or the JVM runs
	 *                                out of memory while parsing; the
	 *                                triggering failure is always attached as
	 *                                the cause
	 * @throws IOException            declared by the interface contract;
	 *                                presumably raised while reading
	 *                                {@code is} (confirm against LexParser)
	 */
	public Resource getResource(InputStream is, MetaData parentMetaData,
			ResourceContainer container) throws ResourceParseException, IOException {
		HTMLMetaData hmd = new HTMLMetaData(parentMetaData);
		ExtractingParseObserver epo = new ExtractingParseObserver(hmd);
		LexParser parser = new LexParser(epo);
		CDATALexer lex = new CDATALexer();
		try {
			lex.setPage(new Page(is, DEFAULT_CHARSET));
			parser.doParse(lex);
		} catch (UnsupportedEncodingException e) {
			// Rethrow only: the caller decides how to report the failure,
			// so no stack trace is printed here.
			throw new ResourceParseException(e);
		} catch (ParserException e) {
			throw new ResourceParseException(e);
		} catch (OutOfMemoryError e) {
			// Deliberately converted into a parse failure rather than left
			// to propagate as an Error. The Error is wrapped in a
			// RuntimeException so the cause is preserved for diagnosis
			// instead of being replaced with null.
			throw new ResourceParseException(
					new RuntimeException("Out of memory while parsing HTML", e));
		}

		return new HTMLResource(hmd, container);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy