org.databene.formats.html.HTML2XML Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of databene-formats Show documentation
'Databene Formats' is an open source software library for supporting data file and other formats like CSV, fixed width files, XLS, Properties and Regex. It is designed for multithreaded use and high performance.
There is a newer version: 1.0.14
Show newest version
/*
 * Copyright (C) 2011-2014 Volker Bergmann ([email protected]).
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.databene.formats.html;

import org.databene.commons.CollectionUtil;
import org.databene.commons.Encodings;
import org.databene.commons.IOUtil;
import org.databene.commons.SystemInfo;
import org.databene.commons.xml.XMLUtil;
import org.databene.formats.html.parser.DefaultHTMLTokenizer;
import org.databene.formats.html.parser.HTMLTokenizer;
import org.databene.formats.html.util.HTMLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;

import java.io.*;
import java.text.ParseException;
import java.util.Set;
import java.util.Stack;
import java.util.Map;

/**
 * Provides utility methods for converting HTML to XML.<br/>
 * <br/>
 * Created: 25.01.2007 17:10:37
 * @author Volker Bergmann
 */
public class HTML2XML {
	
	/** Logger used for reporting malformed input. */
	private static final Logger LOGGER = LoggerFactory.getLogger(HTML2XML.class);

	/** Entity names that are written by name; every other resolved entity is written as a numeric reference. */
	private static final Set<String> COMMON_CODES = CollectionUtil.toSet("lt", "gt", "amp");

    /**
     * Converts an HTML document held in a string to XML text.
     *
     * @param html the HTML source to convert
     * @return the XML text produced by the conversion
     * @throws ParseException if the underlying conversion reports a parse error
     */
    public static String convert(String html) throws ParseException {
        Reader source = new StringReader(html);
        StringWriter target = new StringWriter();
        try {
            // the conversion works purely in memory; UTF-8 is the encoding handed to the context
            convert(new ConversionContext(source, target, "UTF-8"));
            StringBuffer xml = target.getBuffer();
            return xml.toString();
        } catch (IOException e) {
            // in-memory reader and writer: this is not supposed to happen
            throw new RuntimeException(e);
        } finally {
            IOUtil.close(source);
        }
    }

    /**
     * Converts HTML read from a {@link Reader} to XML and writes it to an {@link OutputStream}
     * using the given character encoding. Both the reader and the writer wrapped around
     * <code>out</code> are closed before the method returns, whether it succeeds or fails.
     *
     * @param reader   the source of the HTML text
     * @param out      the stream that receives the encoded XML
     * @param encoding the name of the character encoding used for the output
     * @throws ParseException if the underlying conversion reports a parse error
     * @throws UnsupportedEncodingException if <code>encoding</code> is not supported
     * @throws RuntimeException wrapping any other {@link IOException} raised while converting or flushing
     */
    public static void convert(Reader reader, OutputStream out, String encoding) 
    		throws ParseException, UnsupportedEncodingException {
        Writer writer = null;
        try {
            // created inside the try block so that the reader is released even if the encoding is rejected
            writer = new OutputStreamWriter(out, encoding);
            ConversionContext context = new ConversionContext(reader, writer, encoding);
            convert(context);
            // flush explicitly so that a failing write surfaces here instead of depending on close handling
            writer.flush();
        } catch (UnsupportedEncodingException e) {
            // declared in the signature: pass through unchanged instead of wrapping it below
            throw e;
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            IOUtil.close(reader);
            if (writer != null)
                IOUtil.close(writer);
        }
    }
    
	/**
	 * Loads the content found at a URI, converts it from HTML to XML and parses the result
	 * into a DOM {@link Document}.
	 *
	 * @param url            the URI of the HTML content
	 * @param namespaceAware passed through to the XML parser
	 * @return the parsed document
	 * @throws IOException if loading or parsing fails with an I/O error
	 * @throws ParseException if the HTML-to-XML conversion reports a parse error
	 * @throws UnsupportedEncodingException if the UTF-8 encoding name is not supported
	 */
	public static Document parseHtmlAsXml(String url, boolean namespaceAware) 
			throws IOException, ParseException, UnsupportedEncodingException {
		String xml = convert(IOUtil.getContentOfURI(url));
		// the converted text is handed to the parser as UTF-8 encoded bytes
		byte[] xmlBytes = xml.getBytes(Encodings.UTF_8);
		return XMLUtil.parse(new ByteArrayInputStream(xmlBytes), namespaceAware, null, null, null, null);
	}

    // private helpers -------------------------------------------------------------------------------------------------

    /**
     * Core conversion loop: pulls tokens from <code>context.tokenizer</code> until
     * {@link HTMLTokenizer#END} is reported and writes corresponding output to
     * <code>context.writer</code>, keeping the currently open elements on <code>context.path</code>.
     * Start and end tags named "script" are skipped, document type tokens are left out,
     * and processing stops as soon as an "html" end tag has been handled.
     * <p>
     * NOTE(review): in this copy of the file several string literals in the body look truncated
     * (the closing-tag writes, the comment handling and the end of the switch statement), as if
     * markup had been stripped from the text. Verify against the upstream source before relying
     * on or changing this method.
     *
     * @param context the reader/writer state of the running conversion
     * @throws IOException if writing the output fails
     * @throws ParseException if the tokenizer reports a parse error
     */
    private static void convert(ConversionContext context) throws IOException, ParseException {
    	// TODO use XML serializer
        int token;
        while ((token = context.tokenizer.nextToken()) != HTMLTokenizer.END) {
            switch (token) {
                case HTMLTokenizer.START_TAG:
                case HTMLTokenizer.CLOSED_TAG:
                	ensureXmlHeader(context);
                	// ignore scripts
                    if ("script".equalsIgnoreCase(context.tokenizer.name())) // TODO test script handling
                        continue;
                    String lcTagName = context.tokenizer.name().toLowerCase();
					if (!"html".equals(lcTagName) && !context.rootCreated)
                        ensureRootElement(context);
					else if ("html".equals(lcTagName) && context.rootCreated) {
						// ignore html element if there already was one
						LOGGER.warn("Malformed HTML document: misplaced  element");
						break;
					} else {
                        if (context.path.size() > 0) {
                            String lastTagName = context.path.peek();
                            if (HTMLUtil.isEmptyTag(lastTagName) && !context.tokenizer.name().equals(lastTagName)) {
                                context.writer.write("');
                                context.path.pop();
                            }
                        }
                    }
                    context.rootCreated = true;
					if (token == HTMLTokenizer.CLOSED_TAG) {
	                    writeEmptyTag(context.writer, context.tokenizer);
					} else {
	                    writeStartTag(context.writer, context.tokenizer);
	                    context.path.push(context.tokenizer.name());
					}
                    break;
                case (HTMLTokenizer.END_TAG):
                    if ("script".equalsIgnoreCase(context.tokenizer.name()))
                        continue;
                    boolean done = false;
                    if (contains(context.path, context.tokenizer.name())) {
                        do {
                            String pathTagName = context.path.pop();
                            context.writer.write("');
                            if (pathTagName.equals(context.tokenizer.name()))
                                done = true;
                        } while (!done);
                    }
                    if ("html".equalsIgnoreCase(context.tokenizer.name()))
                        return;
                    break;
                case HTMLTokenizer.TEXT:
                	ensureXmlHeader(context);
                	String text = context.tokenizer.text();
                	if (text != null && text.trim().length() > 0)
                		ensureRootElement(context);
                    writeText(context.writer, text);
                    break;
                case HTMLTokenizer.COMMENT:
                	ensureRootElement(context);
                	String comment = context.tokenizer.text();
                	int s = comment.indexOf("");
                	comment = "";
                    writeXml(context.writer, comment);
                    break;
                case HTMLTokenizer.DOCUMENT_TYPE:
                    // leave out doc type
                    break;
                case HTMLTokenizer.PROCESSING_INSTRUCTION:
                	String piText = context.tokenizer.text();
                	writeXml(context.writer, piText);
                	if (piText.startsWith(" 0) {
            String tagName = context.path.pop();
            context.writer.write("');
        }
    }

    /**
     * Writes the XML declaration, followed by a line separator, unless a header has already
     * been written for this conversion. The declaration names the encoding stored in the context.
     *
     * @param context the state of the running conversion
     * @throws IOException if writing to the context's writer fails
     */
    private static void ensureXmlHeader(ConversionContext context) throws IOException {
        if (!context.xmlHeaderCreated) {
            context.writer.write("<?xml version=\"1.0\" encoding=\"" + context.encoding + "\"?>"
                    + SystemInfo.getLineSeparator());
            context.xmlHeaderCreated = true;
        }
    }

    /**
     * Makes sure the output has an enclosing "html" element: after ensuring the XML header,
     * an "html" start tag is written and pushed onto the path unless a root element already
     * exists or the tokenizer's current name is "html" itself.
     *
     * @param context the state of the running conversion
     * @throws IOException if writing to the context's writer fails
     */
    private static void ensureRootElement(ConversionContext context) throws IOException {
        ensureXmlHeader(context);
        // nothing to do if a root exists already or the current token is the html element itself
        if (context.rootCreated || "html".equals(context.tokenizer.name())) {
            return;
        }
        writeStartTag(context.writer, "html");
        context.path.push("html");
        context.rootCreated = true;
    }

    /**
     * Tells whether an element with the given name is currently open.
     *
     * @param path the stack of open element names
     * @param name the element name to look for (compared case-sensitively)
     * @return <code>true</code> if <code>path</code> holds an entry equal to <code>name</code>
     */
    private static boolean contains(Stack<String> path, String name) {
        for (String tagName : path)
            if (tagName.equals(name))
                return true;
        return false;
    }

    /**
     * Writes the tokenizer's current tag as a self-closing element, including its attributes.
     *
     * @param writer    the target of the output
     * @param tokenizer supplies the tag name and attributes
     * @throws IOException if writing fails
     */
    private static void writeEmptyTag(Writer writer, HTMLTokenizer tokenizer) throws IOException {
        String opening = "<" + tokenizer.name();
        writer.write(opening);
        writeAttributes(writer, tokenizer);
        writer.write("/>");
    }

    /**
     * Writes the tokenizer's current tag as a start tag, including its attributes.
     *
     * @param writer    the target of the output
     * @param tokenizer supplies the tag name and attributes
     * @throws IOException if writing fails
     */
    private static void writeStartTag(Writer writer, HTMLTokenizer tokenizer) throws IOException {
        String opening = "<" + tokenizer.name();
        writer.write(opening);
        writeAttributes(writer, tokenizer);
        writer.write(">");
    }

    /**
     * Writes an attribute-less start tag for the given element name.
     *
     * @param writer the target of the output
     * @param name   the element name
     * @throws IOException if writing fails
     */
    private static void writeStartTag(Writer writer, String name) throws IOException {
        StringBuilder tag = new StringBuilder();
        tag.append('<').append(name).append('>');
        writer.write(tag.toString());
    }

    /**
     * Writes all attributes of the tokenizer's current tag as <code> name="value"</code> pairs.
     * A missing value is written as an empty string. Values are delimited by double quotes,
     * except for values that contain double quotes but no single quotes, which are delimited
     * by single quotes. If a value contains both kinds of quotes, it is delimited by double
     * quotes and each embedded double quote is written as <code>&amp;quot;</code>, so the
     * attribute stays well-formed.
     *
     * @param writer    the target of the output
     * @param tokenizer supplies the attributes of the current tag
     * @throws IOException if writing fails
     */
    private static void writeAttributes(Writer writer, HTMLTokenizer tokenizer) throws IOException {
        for (Map.Entry<String, String> entry : tokenizer.attributes().entrySet()) {
            String value = entry.getValue();
            if (value == null)
                value = "";
            boolean hasDoubleQuote = (value.indexOf('"') >= 0);
            boolean hasSingleQuote = (value.indexOf('\'') >= 0);
            char quote = (hasDoubleQuote && !hasSingleQuote ? '\'' : '"');
            writer.write(' ');
            writer.write(entry.getKey());
            writer.write('=');
            writer.write(quote);
            if (hasDoubleQuote && hasSingleQuote) {
                // neither delimiter is safe as is: escape the double quotes piece by piece
                int from = 0;
                int q;
                while ((q = value.indexOf('"', from)) >= 0) {
                    writeText(writer, value.substring(from, q));
                    writer.write("&quot;");
                    from = q + 1;
                }
                writeText(writer, value.substring(from));
            } else {
                writeText(writer, value);
            }
            writer.write(quote);
        }
    }

    /**
     * Writes markup text after passing it through entity resolution; angle brackets are left as they are.
     *
     * @param writer the target of the output
     * @param s      the markup text to write
     * @throws IOException if writing fails
     */
    private static void writeXml(Writer writer, String s) throws IOException {
        // resolveEntities writes everything up to the last ampersand itself and returns the rest
        String remainder = resolveEntities(writer, s);
        writer.write(remainder);
    }

    /**
     * Writes character data: angle brackets are replaced by the <code>&amp;lt;</code> and
     * <code>&amp;gt;</code> entity references, then the text is passed through entity resolution.
     *
     * @param writer the target of the output
     * @param s      the text to write
     * @throws IOException if writing fails
     */
    private static void writeText(Writer writer, String s) throws IOException {
        // escape before resolving entities, so that the references created here are handled like any other
        s = s.replace("<", "&lt;");
        s = s.replace(">", "&gt;");
        s = resolveEntities(writer, s);
        writer.write(s);
    }

	/**
	 * Scans the text for ampersands and writes everything up to and including the last one
	 * to the writer in XML-safe form. An entity recognized by {@link HtmlEntity} is written by
	 * name if it is one of the common codes, otherwise as a numeric character reference.
	 * An ampersand that does not start a recognized, semicolon-terminated entity is written
	 * as <code>&amp;amp;</code>.
	 *
	 * @param writer receives the processed leading part of the text
	 * @param s      the text to process
	 * @return the trailing part of <code>s</code> that contains no ampersand and has not been written yet
	 * @throws IOException if writing fails
	 */
	private static String resolveEntities(Writer writer, String s) throws IOException {
	    int i;
        while ((i = s.indexOf('&')) >= 0) {
            HtmlEntity entity = HtmlEntity.getEntity(s, i);
            // guard against an entity reported without a terminating semicolon:
            // without it the remaining text would never shrink and the loop would not end
            int end = (entity != null ? s.indexOf(';', i) : -1);
            if (entity != null && end >= 0) {
                writer.write(s.substring(0, i + 1));
                if (COMMON_CODES.contains(entity.htmlCode))
                	writer.write(entity.htmlCode + ';');
                else
                	writer.write("#" + entity.xmlCode + ";");
                s = s.substring(end + 1);
            } else {
                // bare ampersand: must be escaped to keep the output well-formed
                writer.write(s.substring(0, i));
                writer.write("&amp;");
                s = s.substring(i + 1);
            }
        }
	    return s;
    }

    /**
     * Holds the state of one conversion run: the tokenizer reading the HTML, the writer
     * receiving the XML, the stack of currently open element names and two flags that record
     * whether the XML header and the root element have been written.
     */
    private static class ConversionContext {
    	
    	/** Name of the output encoding as passed in by the caller. */
    	public final String encoding;
		final Writer writer;
    	final HTMLTokenizer tokenizer;
    	/** Names of the elements that have been opened and not yet closed, innermost on top. */
    	final Stack<String> path;
    	boolean xmlHeaderCreated;
    	boolean rootCreated;
    	
    	/**
    	 * @param reader   the source of the HTML text, wrapped in a {@link DefaultHTMLTokenizer}
    	 * @param writer   the target of the XML output
    	 * @param encoding the name of the output encoding
    	 */
    	ConversionContext(Reader reader, Writer writer, String encoding) {
    		this.tokenizer = new DefaultHTMLTokenizer(reader);
    		this.path = new Stack<String>();
    		this.xmlHeaderCreated = false;
    		this.rootCreated = false;
    		this.writer = writer;
    		this.encoding = encoding;
    	}
    }

}