cat.inspiracio.html.HTMLDocumentBuilder Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of html-parser Show documentation

HTML-parser provides a parser for HTML 5 that produces an HTML 5 document object model. It aims to be a Java implementation of http://www.w3.org/TR/html5/. It is intended for use on the server, so it does not implement features that are relevant in the client, such as event handling. It is intended for use from JavaScript, via Java's scripting library.

The newest version!

/*
Copyright 2015 Alexander Bunkenburg 

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cat.inspiracio.html;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/** Entry point for obtaining {@link HTMLDocument} instances from HTML5 input.
 * Every parse method of this class yields a complete document.
 *
 * To parse single elements or document fragments instead, use {@link HTMLBuilder}.
 * */
public class HTMLDocumentBuilder extends HTMLBuilder{

	/** Creates a builder backed by a fresh standard {@link HTMLDOMImplementation}. */
	public HTMLDocumentBuilder() {
		this(new HTMLDOMImplementation());
	}

	/** Creates a builder backed by the given, possibly extended, implementation.
	 * @param implementation The DOM implementation that is handed on to {@link HTMLBuilder}. */
	public HTMLDocumentBuilder(HTMLDOMImplementation implementation){
		super(implementation);
	}

	// business methods -------------------------

	/** Creates a new, unparsed HTMLDocument. The document is described as
	 * having the minimal tree, consisting of the elements
	 * html, head, title, and body.
	 *
	 * @return  A new HTMLDocument object.
	 */
	public HTMLDocument newDocument(){
		final HTMLDocument fresh=(HTMLDocument)htb.newDocument();
		return fresh;
	}

	/** Parses the source into a complete document, even if the source is just a fragment.
	 *
	 * All other parse(..) overloads of this class end up here; this is the
	 * only one that does real parsing.
	 *
	 * @param source Where the HTML is read from.
	 * @return The parsed document, after it has been passed through clean(..).
	 * @throws SAXException Declared by the underlying parse.
	 * @throws IOException Declared by the underlying parse.
	 */
	@Override public HTMLDocument parse(InputSource source) throws SAXException, IOException{
		final HTMLDocument parsed=(HTMLDocument)htb.parse(source);
		return clean(parsed);
	}

	/** Parses HTML held in a string into a complete document, even if the string is just a fragment.
	 *
	 * @param s The HTML text.
	 * @return The parsed document.
	 * @throws SAXException Declared by the underlying parse.
	 * @throws IOException Declared by the underlying parse.
	 */
	@Override public HTMLDocument parse(String s) throws SAXException, IOException{
		final Reader text=new StringReader(s);
		final InputSource source=new InputSource(text);
		return parse(source);
	}

	/** Parses HTML from a byte stream into a complete document, even if the stream holds just a fragment.
	 *
	 * @param in The byte stream to read the HTML from.
	 * @return The parsed document.
	 * @throws SAXException Declared by the underlying parse.
	 * @throws IOException Declared by the underlying parse.
	 */
	@Override public HTMLDocument parse(InputStream in) throws SAXException, IOException{
		final InputSource source=new InputSource(in);
		return parse(source);
	}

	/** Parses HTML from a character stream into a complete document, even if the stream holds just a fragment.
	 *
	 * @param in The character stream to read the HTML from.
	 * @return The parsed document.
	 * @throws SAXException Declared by the underlying parse.
	 * @throws IOException Declared by the underlying parse.
	 */
	@Override public HTMLDocument parse(Reader in) throws SAXException, IOException{
		final InputSource source=new InputSource(in);
		return parse(source);
	}

}