cat.inspiracio.html.HTMLBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of html-parser Show documentation
HTML-parser provides a parser for HTML 5 that produces an HTML 5 document object model. It aims to be a Java implementation of http://www.w3.org/TR/html5/. It is for use on the server. It does not implement features that are relevant only in the client, like event handling. It is for use from JavaScript, via Java's scripting library.
The newest version!
/*
Copyright 2017 Alexander Bunkenburg 

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cat.inspiracio.html;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;

import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import nu.validator.htmlparser.dom.HtmlDocumentBuilder;

/** Parses HTML nodes:
 * A full HTML document (with doctype) will be returned as HTMLDocument.
 * An html-element will also be returned as HTMLDocument, even without doctype.
 * A different single element will be returned as HTMLElement.
 * A text will be returned as Text.
 * Multiple nodes (for example, text, element, text) will be returned as DocumentFragment.
 * */
public class HTMLBuilder {

	// state -------------------------------------
	
	protected HtmlDocumentBuilder htb;

	// construction ------------------------------

	/** Creates a builder that is backed by a freshly created HTMLDOMImplementation. */
	public HTMLBuilder(){
		//Same set-up as the one-argument constructor, just with a new implementation.
		this(new HTMLDOMImplementation());
	}

	/** Creates a builder whose underlying HtmlDocumentBuilder is constructed 
	 * with the given DOM implementation.
	 * 
	 * @param implementation The DOM implementation handed to the HtmlDocumentBuilder. */
	public HTMLBuilder(HTMLDOMImplementation implementation){
		HtmlDocumentBuilder builder=new HtmlDocumentBuilder(implementation);
		this.htb=builder;
	}

	// configuration -----------------------------
	
	/** Gets the DOM implementation of the underlying HtmlDocumentBuilder.
	 * 
	 * @return What the underlying builder reports as its DOM implementation, 
	 * 	cast to HTMLDOMImplementation. */
	public HTMLDOMImplementation getDOMImplementation(){
		org.w3c.dom.DOMImplementation implementation=htb.getDOMImplementation();
		return (HTMLDOMImplementation)implementation;
	}

	/** Registers a class as the implementation of a custom element.
	 * The registration is delegated to the DOM implementation of this builder.
	 *  
	 * @param cl The class for the custom element. 
	 * 	It must have a public constructor with one parameter of type HTMLDocumentImp 
	 * 	that calls the super-constructor with two parameters: 
	 * 	that same HTMLDocumentImp and the desired tag name. 
	 * 
	 * @param tags The tag names for which the class is registered:
	 * 
	 * 	Zero tag names: the implementation will try to get the tag name from the class, 
	 * 	by instantiating it (with owner == null) and calling getTagName().
	 * 
	 * 	Exactly one tag name: the usual case.
	 * 
	 * 	Several tag names: the constructor of the class must remember the tag name.
	 * 
	 *  */
	public void register(Class cl, String... tags) {
		HTMLDOMImplementation implementation=getDOMImplementation();
		implementation.register(cl, tags);
	}

	// business methods: parsing --------------------
	
	/** Parses HTML given as a string.
	 * Parses and returns an HTMLDocument, an HTMLElement, a Text, or a DocumentFragment.
	 * The string is wrapped in a StringReader and passed on to parse(InputSource).
	 * 
	 * @param s The HTML source text.
	 * @return The parsed node, as returned by parse(InputSource).
	 * @throws SAXException Passed on from parse(InputSource).
	 * @throws IOException Passed on from parse(InputSource). */
	public Node parse(String s) throws SAXException, IOException{
		return parse(new InputSource(new StringReader(s)));
	}

	/** Parses HTML read from a byte stream.
	 * Parses and returns an HTMLDocument, an HTMLElement, a Text, or a DocumentFragment.
	 * The stream is wrapped in an InputSource and passed on to parse(InputSource).
	 * 
	 * @param in The byte stream that delivers the HTML source.
	 * @return The parsed node, as returned by parse(InputSource).
	 * @throws SAXException Passed on from parse(InputSource).
	 * @throws IOException Passed on from parse(InputSource). */
	public Node parse(InputStream in) throws SAXException, IOException{
		InputSource source=new InputSource(in);
		return parse(source);
	}

	/** Parses HTML read from a character stream.
	 * Parses and returns an HTMLDocument, an HTMLElement, a Text, or a DocumentFragment.
	 * The reader is wrapped in an InputSource and passed on to parse(InputSource).
	 * 
	 * @param in The reader that delivers the HTML source.
	 * @return The parsed node, as returned by parse(InputSource).
	 * @throws SAXException Passed on from parse(InputSource).
	 * @throws IOException Passed on from parse(InputSource). */
	public Node parse(Reader in) throws SAXException, IOException{
		InputSource source=new InputSource(in);
		return parse(source);
	}

	/** Parses and returns an HTMLDocument, an HTMLElement, a Text, or a DocumentFragment.
	 * @param source ...
	 * @return ...
	 * @throws SAXException ...
	 * @throws IOException ... */
	public Node parse(InputSource source) throws SAXException, IOException{
		//This is the only parse(..) that really parses.
		
		//First, parse a document, then check which of <html>, <head>, <body> are fake.
		HTMLDocument d=(HTMLDocument)htb.parse(source);
		
		HTMLHtmlElement html=d.getHtml();
		HTMLHeadElement head=d.getHead();
		HTMLBodyElement body=d.getBody();
		boolean fakeHtml=html.hasAttribute("fake");
		boolean fakeHead=head.hasAttribute("fake");
		boolean fakeBody=body.hasAttribute("fake");
		clean(d);
		
		//It really is a document.
		if(!fakeHtml){
			
			if(fakeHead && !head.hasChildNodes())
				html.removeChild(head);
			if(fakeBody && !body.hasChildNodes())
				html.removeChild(body);
			
			return d;
		}
		
		//head body
		if(!fakeHead && !fakeBody){
			DocumentFragment fragment=d.createDocumentFragment();
			fragment.appendChild(head);
			fragment.appendChild(body);
			return fragment;
		}
		
		//head
		if(!fakeHead && fakeBody)
			return head;
		
		//body
		if(fakeHead && !fakeBody)
			return body;

		//All are fake: html, head, body.
		
		//How many real nodes have we got?
		NodeList inHead=head.getChildNodes();
		int headLength=inHead.getLength();
		NodeList inBody=body.getChildNodes();
		int bodyLength=inBody.getLength();
		
		//just one real child
		if(headLength==1 && bodyLength==0)
			return head.getFirstChild();
		if(headLength==0 && bodyLength==1)
			return body.getFirstChild();
		
		//collect nodes from head and body
		DocumentFragment fragment=d.createDocumentFragment();
		for(int i=0; i