org.databene.formats.html.HTML2XML Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of databene-formats Show documentation
Show all versions of databene-formats Show documentation
'Databene Formats' is an open source software library that supports
data files and other formats such as CSV, fixed-width files, XLS, Properties and Regex.
It is designed for multithreaded use and high performance.
/*
* Copyright (C) 2011-2014 Volker Bergmann ([email protected]).
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.databene.formats.html;
import org.databene.commons.CollectionUtil;
import org.databene.commons.Encodings;
import org.databene.commons.IOUtil;
import org.databene.commons.SystemInfo;
import org.databene.commons.xml.XMLUtil;
import org.databene.formats.html.parser.DefaultHTMLTokenizer;
import org.databene.formats.html.parser.HTMLTokenizer;
import org.databene.formats.html.util.HTMLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import java.io.*;
import java.text.ParseException;
import java.util.Set;
import java.util.Stack;
import java.util.Map;
/**
* Provides utility methods for converting HTML to XML.
*
* Created: 25.01.2007 17:10:37
* @author Volker Bergmann
*/
public class HTML2XML {
private static final Logger LOGGER = LoggerFactory.getLogger(HTML2XML.class);
private static final Set COMMON_CODES = CollectionUtil.toSet("lt", "gt", "amp");
/**
 * Converts HTML text held in memory and returns everything the conversion wrote.
 * The conversion context is created with the encoding name "UTF-8".
 *
 * @param html the HTML text to convert
 * @return the text produced by the conversion
 * @throws ParseException declared by the internal conversion routine
 */
public static String convert(String html) throws ParseException {
    Reader htmlSource = new StringReader(html);
    StringWriter xmlBuffer = new StringWriter();
    try {
        convert(new ConversionContext(htmlSource, xmlBuffer, "UTF-8"));
    } catch (IOException e) {
        throw new RuntimeException(e); // this is not supposed to happen
    } finally {
        IOUtil.close(htmlSource);
    }
    // reached only when the conversion completed without an exception
    return xmlBuffer.toString();
}
/**
 * Converts the HTML supplied by a reader and writes the result to an output stream
 * using the given character encoding.
 * <p>
 * The reader is handed to {@code IOUtil.close} on every exit path, including the case in which
 * the encoding is rejected before any conversion starts. The writer wrapping {@code out} is
 * handed to {@code IOUtil.close} as well once it has been created.
 *
 * @param reader   source of the HTML text
 * @param out      stream that receives the converted text
 * @param encoding name of the character encoding used for the output
 * @throws ParseException               declared by the internal conversion routine
 * @throws UnsupportedEncodingException if {@code encoding} is not supported by {@link OutputStreamWriter}
 */
public static void convert(Reader reader, OutputStream out, String encoding)
        throws ParseException, UnsupportedEncodingException {
    try {
        // Created inside the outer try so that a rejected encoding still releases the reader.
        Writer writer = new OutputStreamWriter(out, encoding);
        try {
            convert(new ConversionContext(reader, writer, encoding));
        } catch (IOException e) {
            throw new RuntimeException(e); // this is not supposed to happen
        } finally {
            IOUtil.close(writer);
        }
    } finally {
        IOUtil.close(reader);
    }
}
/**
 * Fetches the content behind a URI, converts it with {@link #convert(String)} and parses the
 * result into a DOM document.
 *
 * @param url            location of the HTML content, resolved by {@code IOUtil.getContentOfURI}
 * @param namespaceAware passed through to {@code XMLUtil.parse}
 * @return the document returned by {@code XMLUtil.parse}
 * @throws IOException                  declared for fetching and parsing
 * @throws ParseException               declared by the HTML conversion
 * @throws UnsupportedEncodingException if the encoding named by {@code Encodings.UTF_8} is unavailable
 */
public static Document parseHtmlAsXml(String url, boolean namespaceAware)
        throws IOException, ParseException, UnsupportedEncodingException {
    String xml = convert(IOUtil.getContentOfURI(url));
    byte[] xmlBytes = xml.getBytes(Encodings.UTF_8);
    ByteArrayInputStream xmlStream = new ByteArrayInputStream(xmlBytes);
    return XMLUtil.parse(xmlStream, namespaceAware, null, null, null, null);
}
// private helpers -------------------------------------------------------------------------------------------------
/**
 * Drives the conversion: reads tokens from the tokenizer of the context until
 * {@code HTMLTokenizer.END} is reported and writes output to the writer of the context,
 * using {@code context.path} as the stack of currently open element names.
 * <p>
 * NOTE(review): several string literals and the end of this method look truncated in this copy
 * of the file (see the inline notes); confirm against the upstream source before relying on it.
 *
 * @param context the conversion state: tokenizer, writer, element stack and progress flags
 * @throws IOException    declared for the writer operations
 * @throws ParseException declared; presumably raised by the tokenizer - TODO confirm
 */
private static void convert(ConversionContext context) throws IOException, ParseException {
// TODO use XML serializer
int token;
while ((token = context.tokenizer.nextToken()) != HTMLTokenizer.END) {
switch (token) {
// start tags and self-closed tags share everything except the final write
case HTMLTokenizer.START_TAG:
case HTMLTokenizer.CLOSED_TAG:
ensureXmlHeader(context);
// ignore scripts
if ("script".equalsIgnoreCase(context.tokenizer.name())) // TODO test script handling
// continue applies to the enclosing while loop, i.e. the next token is read
continue;
String lcTagName = context.tokenizer.name().toLowerCase();
// first tag is not html and no root exists yet: let ensureRootElement open one
if (!"html".equals(lcTagName) && !context.rootCreated)
ensureRootElement(context);
else if ("html".equals(lcTagName) && context.rootCreated) {
// ignore html element if there already was one
LOGGER.warn("Malformed HTML document: misplaced element");
break;
} else {
// if the element on top of the stack is an empty tag according to HTMLUtil.isEmptyTag
// and a tag with a different name begins, close and pop it before going on
if (context.path.size() > 0) {
String lastTagName = context.path.peek();
if (HTMLUtil.isEmptyTag(lastTagName) && !context.tokenizer.name().equals(lastTagName)) {
// NOTE(review): the leading literal is empty here, so only the name and a closing
// angle bracket are written; it looks like the close-tag prefix was lost - confirm
context.writer.write("" + lastTagName + '>');
context.path.pop();
}
}
}
context.rootCreated = true;
if (token == HTMLTokenizer.CLOSED_TAG) {
// self-closed tags are written in one piece and never pushed on the stack
writeEmptyTag(context.writer, context.tokenizer);
} else {
writeStartTag(context.writer, context.tokenizer);
context.path.push(context.tokenizer.name());
}
break;
case (HTMLTokenizer.END_TAG):
if ("script".equalsIgnoreCase(context.tokenizer.name()))
continue;
boolean done = false;
// an end tag without a matching open element on the stack writes nothing;
// otherwise every element down to and including the matching one is popped and closed
if (contains(context.path, context.tokenizer.name())) {
do {
String pathTagName = context.path.pop();
// NOTE(review): same apparently truncated close-tag literal as above - confirm
context.writer.write("" + pathTagName + '>');
if (pathTagName.equals(context.tokenizer.name()))
done = true;
} while (!done);
}
// an html end tag ends the whole conversion, remaining tokens are not read
if ("html".equalsIgnoreCase(context.tokenizer.name()))
return;
break;
case HTMLTokenizer.TEXT:
ensureXmlHeader(context);
String text = context.tokenizer.text();
// only non-blank text forces a root element; the text is passed to writeText either way
if (text != null && text.trim().length() > 0)
ensureRootElement(context);
writeText(context.writer, text);
break;
case HTMLTokenizer.COMMENT:
ensureRootElement(context);
String comment = context.tokenizer.text();
// NOTE(review): as written, s is never used and the comment text is replaced by an
// empty string before being written; the delimiter handling looks truncated - confirm
int s = comment.indexOf("");
comment = "";
writeXml(context.writer, comment);
break;
case HTMLTokenizer.DOCUMENT_TYPE:
// leave out doc type
break;
case HTMLTokenizer.PROCESSING_INSTRUCTION:
String piText = context.tokenizer.text();
writeXml(context.writer, piText);
// NOTE(review): the next line is visibly cut off in this copy (unterminated literal);
// the remaining lines appear to pop and close elements still on the stack - confirm
if (piText.startsWith(" 0) {
String tagName = context.path.pop();
context.writer.write("" + tagName + '>');
}
}
/**
 * Writes the XML declaration once per conversion.
 * <p>
 * If the context does not yet record a header, an XML 1.0 declaration naming
 * {@code context.encoding} is written, followed by the line separator reported by
 * {@code SystemInfo}, and the context is flagged so that later calls do nothing.
 *
 * @param context the conversion state holding the writer, the encoding name and the header flag
 * @throws IOException if writing to the context's writer fails
 */
private static void ensureXmlHeader(ConversionContext context) throws IOException {
    if (context.xmlHeaderCreated)
        return;
    // The declaration has to name the encoding the output is actually written in.
    context.writer.write("<?xml version=\"1.0\" encoding=\"" + context.encoding + "\"?>"
            + SystemInfo.getLineSeparator());
    context.xmlHeaderCreated = true;
}
/**
 * Makes sure the output has an enclosing html element.
 * <p>
 * The XML header is ensured first. Nothing else happens if a root was already created or if the
 * tokenizer's current name is "html"; otherwise an html start tag is written, pushed on the
 * element stack and the context is flagged as having a root.
 *
 * @param context the conversion state
 * @throws IOException if writing to the context's writer fails
 */
private static void ensureRootElement(ConversionContext context) throws IOException {
    ensureXmlHeader(context);
    // the tokenizer is only consulted when no root exists yet
    if (context.rootCreated || "html".equals(context.tokenizer.name()))
        return;
    writeStartTag(context.writer, "html");
    context.path.push("html");
    context.rootCreated = true;
}
/**
 * Tells whether an element with the given name is currently open.
 *
 * @param path the stack of open element names
 * @param name the element name to look for (compared case-sensitively)
 * @return {@code true} if {@code path} holds an entry equal to {@code name}
 */
private static boolean contains(Stack<String> path, String name) {
    // Stack inherits an equals-based linear search from Vector, which is all the old loop did.
    return path.contains(name);
}
/**
 * Writes the tokenizer's current tag as a self-closing element: the opening angle bracket and
 * name, the attributes, then "/>".
 *
 * @param writer    destination of the output
 * @param tokenizer supplies the tag name and attributes
 * @throws IOException if writing fails
 */
private static void writeEmptyTag(Writer writer, HTMLTokenizer tokenizer) throws IOException {
    String tagName = tokenizer.name();
    writer.write("<" + tagName);
    writeAttributes(writer, tokenizer);
    writer.write("/>");
}
/**
 * Writes the tokenizer's current tag as a start tag: the opening angle bracket and name,
 * the attributes, then the closing angle bracket.
 *
 * @param writer    destination of the output
 * @param tokenizer supplies the tag name and attributes
 * @throws IOException if writing fails
 */
private static void writeStartTag(Writer writer, HTMLTokenizer tokenizer) throws IOException {
    String tagName = tokenizer.name();
    writer.write("<" + tagName);
    writeAttributes(writer, tokenizer);
    writer.write(">");
}
/**
 * Writes an attribute-less start tag for the given element name.
 *
 * @param writer destination of the output
 * @param name   the element name placed between the angle brackets
 * @throws IOException if writing fails
 */
private static void writeStartTag(Writer writer, String name) throws IOException {
    StringBuilder tag = new StringBuilder();
    tag.append('<').append(name).append('>');
    writer.write(tag.toString());
}
/**
 * Writes every attribute of the tokenizer's current tag as {@code  name=<quote>value<quote>}.
 * <p>
 * A {@code null} value is written as an empty string. Values are delimited by double quotes
 * unless the value itself contains a double quote, in which case single quotes are used. If the
 * value contains both kinds of quote, its double quotes are replaced by the {@code &quot;}
 * reference so that the delimiter can never appear inside the value.
 *
 * @param writer    destination of the output
 * @param tokenizer supplies the attributes; assumed to expose them as a map from name to value
 * @throws IOException if writing fails
 */
private static void writeAttributes(Writer writer, HTMLTokenizer tokenizer) throws IOException {
    for (Map.Entry<String, String> entry : tokenizer.attributes().entrySet()) {
        String value = entry.getValue();
        if (value == null)
            value = "";
        char quote = '"';
        if (value.indexOf('"') >= 0) {
            if (value.indexOf('\'') < 0)
                quote = '\'';
            else
                // Neither delimiter is safe as-is; the reference is passed on to writeText,
                // which resolves entity references before writing.
                value = value.replace("\"", "&quot;");
        }
        writer.write(' ');
        writer.write(entry.getKey());
        writer.write('=');
        writer.write(quote);
        writeText(writer, value);
        writer.write(quote);
    }
}
/**
 * Writes a piece of markup after passing it through {@code resolveEntities}: that method emits
 * the processed part itself and returns the remaining tail, which is written here.
 *
 * @param writer destination of the output
 * @param s      the markup text to write
 * @throws IOException if writing fails
 */
private static void writeXml(Writer writer, String s) throws IOException {
    String tail = resolveEntities(writer, s);
    writer.write(tail);
}
/**
 * Writes character data (element text or an attribute value).
 * <p>
 * Angle brackets are replaced by the {@code &lt;} and {@code &gt;} references so they cannot be
 * mistaken for markup; the result is then passed through {@code resolveEntities}, which emits the
 * processed part and returns the tail that is written here. A {@code null} text writes nothing.
 *
 * @param writer destination of the output
 * @param s      the text to write, may be {@code null}
 * @throws IOException if writing fails
 */
private static void writeText(Writer writer, String s) throws IOException {
    if (s == null)
        return; // the TEXT branch of the caller checks for null but still calls this method
    String escaped = s.replace("<", "&lt;").replace(">", "&gt;");
    writer.write(resolveEntities(writer, escaped));
}
/**
 * Rewrites the entity references of a text so that the output only uses references XML knows.
 * <p>
 * The text is scanned for ampersands. Everything up to and including the last ampersand handled
 * is written to {@code writer} directly; the untouched remainder is returned for the caller to
 * write. For each ampersand:
 * <ul>
 *   <li>if {@code HtmlEntity.getEntity} recognizes an entity at that position and a terminating
 *       semicolon follows, the entity is written by name when its HTML code is one of
 *       {@code COMMON_CODES} and as a {@code &#...;} reference built from its XML code otherwise;</li>
 *   <li>otherwise the ampersand is plain text and is written as {@code &amp;}.</li>
 * </ul>
 *
 * @param writer destination for the processed part of the text
 * @param s      the text to process
 * @return the part of {@code s} after the last ampersand handled (all of {@code s} if there is none)
 * @throws IOException if writing fails
 */
private static String resolveEntities(Writer writer, String s) throws IOException {
    int pos = 0; // start of the part of s that has not been written yet
    int amp;
    while ((amp = s.indexOf('&', pos)) >= 0) {
        writer.write(s, pos, amp - pos);
        HtmlEntity entity = HtmlEntity.getEntity(s, amp);
        // Without a terminating semicolon there is nothing to skip over, so the ampersand is
        // treated as plain text instead of being scanned again and again.
        int semicolon = (entity != null ? s.indexOf(';', amp) : -1);
        if (semicolon >= 0) {
            if (COMMON_CODES.contains(entity.htmlCode))
                writer.write("&" + entity.htmlCode + ';');
            else
                writer.write("&#" + entity.xmlCode + ";");
            pos = semicolon + 1;
        } else {
            writer.write("&amp;");
            pos = amp + 1;
        }
    }
    return s.substring(pos);
}
/**
 * Mutable state of one conversion run: where tokens come from, where output goes,
 * which elements are currently open and what has already been emitted.
 */
private static class ConversionContext {
    /** Name of the output encoding. */
    public String encoding;
    /** Destination of the converted text. */
    Writer writer;
    /** Source of the HTML tokens. */
    HTMLTokenizer tokenizer;
    /** Names of the elements that are currently open, innermost on top. */
    Stack<String> path;
    /** Set once the XML header has been written or found. */
    boolean xmlHeaderCreated;
    /** Set once a root element has been written. */
    boolean rootCreated;

    /**
     * Creates the state for a fresh conversion: nothing written yet and no open elements.
     *
     * @param reader   source of the HTML text, wrapped in a {@code DefaultHTMLTokenizer}
     * @param writer   destination of the converted text
     * @param encoding name of the output encoding
     */
    ConversionContext(Reader reader, Writer writer, String encoding) {
        this.tokenizer = new DefaultHTMLTokenizer(reader);
        this.path = new Stack<String>();
        this.xmlHeaderCreated = false;
        this.rootCreated = false;
        this.writer = writer;
        this.encoding = encoding;
    }
}
}