All downloads are free. The search and download functionality uses the official Maven repository.

org.databene.html.HTML2XML Maven / Gradle / Ivy

/*
 * (c) Copyright 2007-2010 by Volker Bergmann. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, is permitted under the terms of the
 * GNU General Public License.
 *
 * For redistributing this software or a derivative work under a license other
 * than the GPL-compatible Free Software License as defined by the Free
 * Software Foundation or approved by OSI, you must first obtain a commercial
 * license to this software product from Volker Bergmann.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * WITHOUT A WARRANTY OF ANY KIND. ALL EXPRESS OR IMPLIED CONDITIONS,
 * REPRESENTATIONS AND WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE
 * HEREBY EXCLUDED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

package org.databene.html;

import org.databene.commons.CollectionUtil;
import org.databene.commons.IOUtil;
import org.databene.commons.SystemInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.text.ParseException;
import java.util.Set;
import java.util.Stack;
import java.util.Map;

/**
 * Provides utility methods for converting HTML to XML.
*
* Created: 25.01.2007 17:10:37 * @author Volker Bergmann */ public class HTML2XML { private static final Logger LOGGER = LoggerFactory.getLogger(HTML2XML.class); private static final Set COMMON_CODES = CollectionUtil.toSet("lt", "gt", "amp"); public static String convert(String html) throws ParseException { Reader reader = new StringReader(html); StringWriter writer = new StringWriter(); try { ConversionContext context = new ConversionContext(reader, writer, "UTF-8"); convert(context); return writer.getBuffer().toString(); } catch (IOException e) { throw new RuntimeException(e); // this is not supposed to happen } finally { IOUtil.close(reader); } } public static void convert(Reader reader, OutputStream out, String encoding) throws ParseException, UnsupportedEncodingException { Writer writer = new OutputStreamWriter(out, encoding); try { ConversionContext context = new ConversionContext(reader, writer, encoding); convert(context); } catch (IOException e) { throw new RuntimeException(e); // this is not supposed to happen } finally { IOUtil.close(reader); IOUtil.close(writer); } } private static void convert(ConversionContext context) throws IOException, ParseException { // TODO v0.5.x use XML serializer int token; while ((token = context.tokenizer.nextToken()) != HTMLTokenizer.END) { switch (token) { case HTMLTokenizer.START_TAG: case HTMLTokenizer.CLOSED_TAG: ensureXmlHeader(context); // ignore scripts if ("script".equalsIgnoreCase(context.tokenizer.name())) // TODO v0.5.x test script handling continue; String lcTagName = context.tokenizer.name().toLowerCase(); if (!"html".equals(lcTagName) && !context.rootCreated) ensureRootElement(context); else if ("html".equals(lcTagName) && context.rootCreated) { // ignore html element if there already was one LOGGER.warn("Malformed HTML document: misplaced element"); break; } else { if (context.path.size() > 0) { String lastTagName = context.path.peek(); if (HTMLUtil.isEmptyTag(lastTagName) && 
!context.tokenizer.name().equals(lastTagName)) { context.writer.write("'); context.path.pop(); } } } context.rootCreated = true; if (token == HTMLTokenizer.CLOSED_TAG) { writeEmptyTag(context.writer, context.tokenizer); } else { writeStartTag(context.writer, context.tokenizer); context.path.push(context.tokenizer.name()); } break; case (HTMLTokenizer.END_TAG): if ("script".equalsIgnoreCase(context.tokenizer.name())) continue; boolean done = false; if (contains(context.path, context.tokenizer.name())) { do { String pathTagName = context.path.pop(); context.writer.write("'); if (pathTagName.equals(context.tokenizer.name())) done = true; } while (!done); } if ("html".equalsIgnoreCase(context.tokenizer.name())) return; break; case HTMLTokenizer.TEXT: ensureXmlHeader(context); String text = context.tokenizer.text(); if (text != null && text.trim().length() > 0) ensureRootElement(context); writeText(context.writer, text); break; case HTMLTokenizer.COMMENT: ensureRootElement(context); String comment = context.tokenizer.text(); int s = comment.indexOf(""); comment = ""; writeXml(context.writer, comment); break; case HTMLTokenizer.DOCUMENT_TYPE: // leave out doc type break; case HTMLTokenizer.PROCESSING_INSTRUCTION: String piText = context.tokenizer.text(); writeXml(context.writer, piText); if (piText.startsWith(" 0) { String tagName = context.path.pop(); context.writer.write("'); } } private static void ensureXmlHeader(ConversionContext context) throws IOException { if (!context.xmlHeaderCreated) { context.writer.write("" + SystemInfo.getLineSeparator()); context.xmlHeaderCreated = true; } } private static void ensureRootElement(ConversionContext context) throws IOException { ensureXmlHeader(context); // ensure that there is an html root tag if (!context.rootCreated && !"html".equals(context.tokenizer.name())) { writeStartTag(context.writer, "html"); context.path.push("html"); context.rootCreated = true; } } private static boolean contains(Stack path, String name) { for 
(String tagName : path) if (tagName.equals(name)) return true; return false; } private static void writeEmptyTag(Writer writer, HTMLTokenizer tokenizer) throws IOException { writer.write('<' + tokenizer.name()); writeAttributes(writer, tokenizer); writer.write("/>"); } private static void writeStartTag(Writer writer, HTMLTokenizer tokenizer) throws IOException { writer.write('<' + tokenizer.name()); writeAttributes(writer, tokenizer); writer.write('>'); } private static void writeStartTag(Writer writer, String name) throws IOException { writer.write('<' + name + '>'); } private static void writeAttributes(Writer writer, HTMLTokenizer tokenizer) throws IOException { for (Map.Entry entry : tokenizer.attributes().entrySet()) { String value = entry.getValue(); char quote = '"'; if (value == null) value = ""; else if (value.contains("\"")) quote = '\''; writer.write(' '); writer.write(entry.getKey()); writer.write('='); writer.write(quote); writeText(writer, value); writer.write(quote); } } private static void writeXml(Writer writer, String s) throws IOException { s = resolveEntities(writer, s); writer.write(s); } private static void writeText(Writer writer, String s) throws IOException { s = s.replace("<", "<"); s = s.replace(">", ">"); s = resolveEntities(writer, s); writer.write(s); } private static String resolveEntities(Writer writer, String s) throws IOException { int i; while ((i = s.indexOf('&')) >= 0) { HTMLEntity entity = HTMLEntity.getEntity(s, i); if (entity != null) { writer.write(s.substring(0, i + 1)); if (COMMON_CODES.contains(entity.htmlCode)) writer.write(entity.htmlCode + ';'); else writer.write("#" + entity.xmlCode + ";"); s = s.substring(s.indexOf(';', i) + 1); } else { writer.write(s.substring(0, i)); writer.write("&"); s = s.substring(i + 1); } } return s; } private static class ConversionContext { public String encoding; Writer writer; HTMLTokenizer tokenizer; Stack path; boolean xmlHeaderCreated; boolean rootCreated; ConversionContext(Reader 
reader, Writer writer, String encoding) { this.tokenizer = new DefaultHTMLTokenizer(reader); this.path = new Stack(); this.xmlHeaderCreated = false; this.rootCreated = false; this.writer = writer; this.encoding = encoding; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy