org.htmlunit.html.XmlSerializer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of htmlunit Show documentation
Show all versions of htmlunit Show documentation
A headless browser intended for use in testing web-based applications.
The newest version!
/*
* Copyright (c) 2002-2024 Gargoyle Software Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.htmlunit.html;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.htmlunit.Page;
import org.htmlunit.SgmlPage;
import org.htmlunit.WebResponse;
import org.htmlunit.util.MimeType;
/**
* Utility to handle conversion from HTML code to XML string.
* @author Ahmed Ashour
* @author Ronald Brill
* @author Marc Guillemot
*/
public class XmlSerializer {
private static final String FILE_SEPARATOR = "/";
private static final Pattern CREATE_FILE_PATTERN = Pattern.compile(".*/");
private static final Log LOG = LogFactory.getLog(XmlSerializer.class);
private final StringBuilder builder_ = new StringBuilder();
private final StringBuilder indent_ = new StringBuilder();
private File outputDir_;
public void save(final SgmlPage page, final File file) throws IOException {
save(page, file, false);
}
private void save(final SgmlPage page, final File file, final boolean append) throws IOException {
String fileName = file.getName();
if (!append) {
if (!fileName.endsWith(".htm") && !fileName.endsWith(".html")) {
fileName += ".html";
}
}
final File outputFile = new File(file.getParentFile(), fileName);
if (!append && outputFile.exists()) {
throw new IOException("File already exists: " + outputFile);
}
fileName = fileName.substring(0, fileName.lastIndexOf('.'));
outputDir_ = new File(file.getParentFile(), fileName);
// don't use asXml here because we have to sync the encoding from the
// header with the one used by the writer
final DomElement node = page.getDocumentElement();
Charset charsetName = ISO_8859_1;
builder_.setLength(0);
indent_.setLength(0);
if (page.isHtmlPage()) {
charsetName = page.getCharset();
if (charsetName != null && node instanceof HtmlHtml) {
builder_.append("\n");
}
}
printXml(node);
final String response = builder_.toString();
builder_.setLength(0);
FileUtils.writeStringToFile(outputFile, response, charsetName, append);
}
/**
* @param node a node
* @return the xml representation according to the setting of this serializer
* @throws IOException in case of problem saving resources
*/
public String asXml(final DomElement node) throws IOException {
builder_.setLength(0);
indent_.setLength(0);
final SgmlPage page = node.getPage();
if (null != page && page.isHtmlPage()) {
final Charset charsetName = page.getCharset();
if (charsetName != null && node instanceof HtmlHtml) {
builder_.append("\n");
}
}
printXml(node);
final String response = builder_.toString();
builder_.setLength(0);
return response;
}
protected void printXml(final DomElement node) throws IOException {
if (!isExcluded(node)) {
final boolean hasChildren = node.getFirstChild() != null;
builder_.append(indent_).append('<');
printOpeningTag(node);
if (!hasChildren && !node.isEmptyXmlTagExpanded()) {
builder_.append("/>\n");
}
else {
builder_.append(">\n");
for (DomNode child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
indent_.append(" ");
if (child instanceof DomElement) {
printXml((DomElement) child);
}
else {
builder_.append(child);
}
indent_.setLength(indent_.length() - 2);
}
builder_.append(indent_).append("").append(node.getTagName()).append(">\n");
}
}
}
/**
* @param node a node
* @return the text representation according to the setting of this serializer
*/
public String asText(final DomNode node) {
builder_.setLength(0);
if (node instanceof DomText) {
builder_.append(((DomText) node).getData());
}
else {
printText(node);
}
final String response = builder_.toString();
builder_.setLength(0);
return response;
}
/**
* Prints the text content from this node and all children.
* @param node the node
*/
protected void printText(final DomNode node) {
for (DomNode child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
if (child instanceof DomText) {
builder_.append(((DomText) child).getData());
}
else {
printText(child);
}
}
}
/**
* Prints the content between "<" and ">" (or "/>") in the output of the tag name
* and its attributes in XML format.
* @param node the node whose opening tag is to be printed
* @throws IOException in case of problem saving resources
*/
protected void printOpeningTag(final DomElement node) throws IOException {
builder_.append(node.getTagName());
final Map attributes = readAttributes(node);
for (final Map.Entry entry : attributes.entrySet()) {
builder_.append(' ')
.append(entry.getKey())
.append("=\"");
final String value = entry.getValue().getNodeValue();
builder_.append(org.htmlunit.util.StringUtils.escapeXmlAttributeValue(value))
.append('"');
}
}
private Map readAttributes(final DomElement node) throws IOException {
if (node instanceof HtmlImage) {
return getAttributesFor((HtmlImage) node);
}
else if (node instanceof HtmlLink) {
return getAttributesFor((HtmlLink) node);
}
else if (node instanceof BaseFrameElement) {
return getAttributesFor((BaseFrameElement) node);
}
Map attributes = node.getAttributesMap();
if (node instanceof HtmlOption) {
attributes = new HashMap<>(attributes);
final HtmlOption option = (HtmlOption) node;
if (option.isSelected()) {
if (!attributes.containsKey("selected")) {
attributes.put("selected", new DomAttr(node.getPage(), null, "selected", "selected", false));
}
}
else {
attributes.remove("selected");
}
}
return attributes;
}
/**
* @param frame the frame to get the attributes from
* @return the attribute map
*/
private Map getAttributesFor(final BaseFrameElement frame) throws IOException {
final Map map = createAttributesCopyWithClonedAttribute(frame, DomElement.SRC_ATTRIBUTE);
final DomAttr srcAttr = map.get(DomElement.SRC_ATTRIBUTE);
if (srcAttr == null) {
return map;
}
final Page enclosedPage = frame.getEnclosedPage();
final String suffix = getFileExtension(enclosedPage);
final File file = createFile(srcAttr.getValue(), "." + suffix);
if (enclosedPage != null) {
if (enclosedPage.isHtmlPage()) {
new XmlSerializer().save((HtmlPage) enclosedPage, file, true);
}
else {
try (InputStream is = enclosedPage.getWebResponse().getContentAsStream()) {
try (OutputStream fos = Files.newOutputStream(file.toPath())) {
IOUtils.copyLarge(is, fos);
}
}
}
}
srcAttr.setValue(file.getParentFile().getName() + FILE_SEPARATOR + file.getName());
return map;
}
private static String getFileExtension(final Page enclosedPage) {
if (enclosedPage != null) {
if (enclosedPage.isHtmlPage()) {
return "html";
}
final URL url = enclosedPage.getUrl();
if (url.getPath().contains(".")) {
return StringUtils.substringAfterLast(url.getPath(), ".");
}
}
return ".unknown";
}
/**
* @param link the link to get the attributes from
* @return the attribute map
* @throws IOException in case of error
*/
protected Map getAttributesFor(final HtmlLink link) throws IOException {
final Map map = createAttributesCopyWithClonedAttribute(link, "href");
final DomAttr hrefAttr = map.get("href");
if (hrefAttr != null && StringUtils.isNotBlank(hrefAttr.getValue())) {
final String protocol = link.getWebRequest().getUrl().getProtocol();
if ("http".equals(protocol) || "https".equals(protocol)) {
try {
final WebResponse response = link.getWebResponse(true, null);
final File file = createFile(hrefAttr.getValue(), ".css");
FileUtils.writeStringToFile(file, response.getContentAsString(), ISO_8859_1);
hrefAttr.setValue(outputDir_.getName() + FILE_SEPARATOR + file.getName());
}
catch (final IOException e) {
LOG.error("XmlSerializer: IOException while downloading link content from url '"
+ hrefAttr + "'", e);
}
catch (final IllegalStateException e) {
LOG.error("XmlSerializer: IllegalStateException while downloading link content from url '"
+ hrefAttr + "'", e);
}
}
}
return map;
}
/**
* @param image the image to get the attributes from
* @return the attribute map
*/
protected Map getAttributesFor(final HtmlImage image) {
final Map map = createAttributesCopyWithClonedAttribute(image, DomElement.SRC_ATTRIBUTE);
final DomAttr srcAttr = map.get(DomElement.SRC_ATTRIBUTE);
if (srcAttr != null && StringUtils.isNotBlank(srcAttr.getValue())) {
try {
final WebResponse response = image.getWebResponse(true);
try (InputStream inputStream = response.getContentAsStream()) {
final File file = createFile(srcAttr.getValue(), "." + getSuffix(response));
FileUtils.copyInputStreamToFile(inputStream, file);
final String valueOnFileSystem = outputDir_.getName() + FILE_SEPARATOR + file.getName();
// this is the clone attribute node, not the original one of the page
srcAttr.setValue(valueOnFileSystem);
}
}
catch (final IOException e) {
LOG.error("XmlSerializer: IOException while downloading image content from url '" + srcAttr + "'", e);
}
catch (final IllegalStateException e) {
LOG.error("XmlSerializer: IllegalStateException while downloading image content from url '"
+ srcAttr + "'", e);
}
}
return map;
}
private static String getSuffix(final WebResponse response) {
// first try to take the one from the requested file
final String url = response.getWebRequest().getUrl().toString();
final String fileName = StringUtils.substringAfterLast(StringUtils.substringBefore(url, "?"), "/");
// if there is a suffix with 2-4 letters, the take it
final String suffix = StringUtils.substringAfterLast(fileName, ".");
if (suffix.length() > 1 && suffix.length() < 5) {
return suffix;
}
// use content type
return MimeType.getFileExtension(response.getContentType());
}
private static Map createAttributesCopyWithClonedAttribute(final HtmlElement elt,
final String attrName) {
final Map newMap = new HashMap<>(elt.getAttributesMap());
// clone the specified element, if possible
final DomAttr attr = newMap.get(attrName);
if (null == attr) {
return newMap;
}
final DomAttr clonedAttr = new DomAttr(attr.getPage(), attr.getNamespaceURI(),
attr.getQualifiedName(), attr.getValue(), attr.getSpecified());
newMap.put(attrName, clonedAttr);
return newMap;
}
/**
* @param element the element to check
* @return true if the element is a HtmlScript
*/
protected boolean isExcluded(final DomElement element) {
return element instanceof HtmlScript;
}
/**
* Computes the best file to save the response to the given URL.
* @param url the requested URL
* @param extension the preferred extension
* @return the file to create
* @throws IOException if a problem occurs creating the file
*/
private File createFile(final String url, final String extension) throws IOException {
String name = url.replaceFirst("/$", "");
name = CREATE_FILE_PATTERN.matcher(name).replaceAll("");
name = StringUtils.substringBefore(name, "?"); // remove query
name = StringUtils.substringBefore(name, ";"); // remove additional info
name = StringUtils.substring(name, 0, 30); // many file systems have a limit at 255, let's limit it
name = org.htmlunit.util.StringUtils.sanitizeForFileName(name);
if (!name.endsWith(extension)) {
name += extension;
}
int counter = 0;
while (true) {
final String fileName;
if (counter != 0) {
fileName = StringUtils.substringBeforeLast(name, ".")
+ "_" + counter + "." + StringUtils.substringAfterLast(name, ".");
}
else {
fileName = name;
}
FileUtils.forceMkdir(outputDir_);
final File f = new File(outputDir_, fileName);
if (f.createNewFile()) {
return f;
}
counter++;
}
}
}