// cat.inspiracio.html.HTMLBuilder Maven / Gradle / Ivy

// Go to download
/*
Copyright 2017 Alexander Bunkenburg 

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cat.inspiracio.html;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;

import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import nu.validator.htmlparser.dom.HtmlDocumentBuilder;

/** Parses HTML into DOM nodes. The type of the returned node depends on the input:
 * <ul>
 * <li>A full HTML document (with doctype) is returned as HTMLDocument.</li>
 * <li>A single HTML element is returned as HTMLElement.</li>
 * <li>Plain text is returned as Text.</li>
 * <li>Multiple nodes (for example, text, element, text) are returned as DocumentFragment.</li>
 * </ul>
 */
public class HTMLBuilder {

	// state -------------------------------------
	
	/** The underlying parser; created in the constructor with an HTMLDOMImplementation. */
	protected HtmlDocumentBuilder htb;

	// construction ------------------------------
	
	/** Creates the underlying HtmlDocumentBuilder, backed by a new HTMLDOMImplementation. */
	public HTMLBuilder(){
		htb=new HtmlDocumentBuilder(new HTMLDOMImplementation());
	}

	// configuration -----------------------------
	
	/** Gets the DOM implementation of the underlying parser.
	 *
	 * @return The parser's DOM implementation, cast to HTMLDOMImplementation.
	 * */
	public HTMLDOMImplementation getDOMImplementation(){
		HTMLDOMImplementation implementation=(HTMLDOMImplementation)htb.getDOMImplementation();
		return implementation;
	}

	/** Registers a class as the implementation of a custom element.
	 * The registration is delegated to the DOM implementation of this builder.
	 *
	 * @param cl The class that implements the custom element.
	 * 	It must have a public constructor taking a single HTMLDocumentImp,
	 * 	and that constructor must call the super-constructor with two arguments:
	 * 	the same HTMLDocumentImp and the desired tag name.
	 *
	 * @param tags The tag names for which the class is registered.
	 *
	 * 	No tag names: the implementation will try to find the tag name itself,
	 * 	by instantiating the class (with owner == null) and calling getTagName().
	 *
	 * 	Exactly one tag name: the usual case.
	 *
	 * 	Several tag names: the constructor of the class must remember the tag name.
	 *
	 *  */
	public void register(Class cl, String... tags) {
		HTMLDOMImplementation implementation=getDOMImplementation();
		implementation.register(cl, tags);
	}

	// business methods: parsing --------------------
	
	/** Parses HTML given as a string.
	 *
	 * @param s The HTML source text.
	 * @return An HTMLDocument, an HTMLElement, a Text, or a DocumentFragment.
	 * @throws SAXException Propagated from parse(InputSource).
	 * @throws IOException Propagated from parse(InputSource).
	 * */
	public Node parse(String s) throws SAXException, IOException{
		//Wrap the string in a reader, so that the InputSource is a character stream.
		StringReader reader=new StringReader(s);
		return parse(new InputSource(reader));
	}

	/** Parses HTML read from a byte stream.
	 *
	 * @param in The stream that delivers the HTML source.
	 * @return An HTMLDocument, an HTMLElement, a Text, or a DocumentFragment.
	 * @throws SAXException Propagated from parse(InputSource).
	 * @throws IOException Propagated from parse(InputSource).
	 * */
	public Node parse(InputStream in) throws SAXException, IOException{
		InputSource source=new InputSource(in);
		return parse(source);
	}

	/** Parses HTML read from a character stream.
	 *
	 * @param in The reader that delivers the HTML source.
	 * @return An HTMLDocument, an HTMLElement, a Text, or a DocumentFragment.
	 * @throws SAXException Propagated from parse(InputSource).
	 * @throws IOException Propagated from parse(InputSource).
	 * */
	public Node parse(Reader in) throws SAXException, IOException{
		InputSource source=new InputSource(in);
		return parse(source);
	}

	/** Parses and returns an HTMLDocument, an HTMLElement, a Text, or a DocumentFragment. */
	public Node parse(InputSource source) throws SAXException, IOException{
		//This is the only parse() method that really does work.
		
		//First, parse a document, then check which of html, head, body are fake.
		HTMLDocument d=(HTMLDocument)htb.parse(source);
		
		HTMLHtmlElement html=d.getHtml();
		HTMLHeadElement head=d.getHead();
		HTMLBodyElement body=d.getBody();
		boolean fakeHtml=html.hasAttribute("fake");
		boolean fakeHead=head.hasAttribute("fake");
		boolean fakeBody=body.hasAttribute("fake");
		clean(d);
		
		//It really is a document.
		if(!fakeHtml){
			
			if(fakeHead && !head.hasChildNodes())
				html.removeChild(head);
			if(fakeBody && !body.hasChildNodes())
				html.removeChild(body);
			
			return d;
		}
		
		//head body
		if(!fakeHead && !fakeBody){
			DocumentFragment fragment=d.createDocumentFragment();
			fragment.appendChild(head);
			fragment.appendChild(body);
			return fragment;
		}
		
		//head
		if(!fakeHead && fakeBody)
			return head;
		
		//body
		if(fakeHead && !fakeBody)
			return body;

		//All are fake: html, head, body.
		
		//How many real nodes have we got?
		NodeList inHead=head.getChildNodes();
		int headLength=inHead.getLength();
		NodeList inBody=body.getChildNodes();
		int bodyLength=inBody.getLength();
		
		//just one real child
		if(headLength==1 && bodyLength==0)
			return head.getFirstChild();
		if(headLength==0 && bodyLength==1)
			return body.getFirstChild();
		
		//collect nodes from head and body
		DocumentFragment fragment=d.createDocumentFragment();
		for(int i=0; i