org.htmlunit.DefaultPageCreator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of xlt Show documentation
Show all versions of xlt Show documentation
XLT (Xceptance LoadTest) is an extensive load and performance test tool developed and maintained by Xceptance.
/*
* Copyright (c) 2002-2023 Gargoyle Software Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.htmlunit;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.htmlunit.html.DomElement;
import org.htmlunit.html.Html;
import org.htmlunit.html.HtmlPage;
import org.htmlunit.html.XHtmlPage;
import org.htmlunit.html.parser.HTMLParser;
import org.htmlunit.html.parser.neko.HtmlUnitNekoHtmlParser;
import org.htmlunit.util.MimeType;
import org.htmlunit.xml.XmlPage;
/**
* <p>The default implementation of {@link PageCreator}. Designed to be extended for easier handling of new content
* types. Just check the content type in <code>createPage()</code> and call <code>super.createPage()</code>
* if your custom type isn't found. There are also protected <code>createXXXXPage()</code> methods for creating
* the {@link Page} types which HtmlUnit already knows about for your custom content types.</p>
*
* <p>The following table shows the type of {@link Page} created depending on the content type:</p>
*
* <table border="1" style="width: 50%;">
*   <caption>Pages and content types</caption>
*   <tr>
*     <th>Content type</th>
*     <th>Type of page</th>
*   </tr>
*   <tr>
*     <td>text/html<br>
*         text/javascript</td>
*     <td>{@link HtmlPage}</td>
*   </tr>
*   <tr>
*     <td>text/xml<br>
*         application/xml<br>
*         text/vnd.wap.wml<br>
*         *+xml
*     </td>
*     <td>{@link XmlPage}, or an {@link XHtmlPage} if an XHTML namespace is used</td>
*   </tr>
*   <tr>
*     <td>text/*</td>
*     <td>{@link TextPage}</td>
*   </tr>
*   <tr>
*     <td>Anything else</td>
*     <td>{@link UnexpectedPage}</td>
*   </tr>
* </table>
*
* @author Mike Bowler
* @author Christian Sell
* @author Brad Clarke
* @author Marc Guillemot
* @author Ahmed Ashour
* @author Daniel Gredler
* @author Ronald Brill
* @author Antoni Reus
*/
public class DefaultPageCreator implements PageCreator, Serializable {
private static final byte[] markerUTF8_ = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
private static final byte[] markerUTF16BE_ = {(byte) 0xfe, (byte) 0xff};
private static final byte[] markerUTF16LE_ = {(byte) 0xff, (byte) 0xfe};
/**
* See http://tools.ietf.org/html/draft-abarth-mime-sniff-05
*/
private static final String[] htmlPatterns = {"!DOCTYPE HTML", "HTML", "HEAD", "SCRIPT",
"IFRAME", "H1", "DIV", "FONT", "TABLE", "A", "STYLE", "TITLE", "B", "BODY", "BR", "P", "!--" };
private static final HTMLParser htmlParser_ = new HtmlUnitNekoHtmlParser();
/**
* The different supported page types.
*/
public enum PageType {
/** html. */
HTML,
/** javascript. */
JAVASCRIPT,
/** xml. */
XML,
/** text. */
TEXT,
/** unknown. */
UNKNOWN
}
/**
* Determines the kind of page to create from the content type.
* @param contentType the content type to evaluate
* @return "xml", "html", "javascript", "text" or "unknown"
*/
public static PageType determinePageType(final String contentType) {
if (null == contentType) {
return PageType.UNKNOWN;
}
final String contentTypeLC = org.htmlunit.util.StringUtils
.toRootLowerCaseWithCache(contentType);
switch (contentTypeLC) {
case MimeType.TEXT_HTML:
case "image/svg+xml":
return PageType.HTML;
case "text/javascript":
case "application/x-javascript":
case MimeType.APPLICATION_JAVASCRIPT:
return PageType.JAVASCRIPT;
case MimeType.TEXT_XML:
case "application/xml":
case "text/vnd.wap.wml":
return PageType.XML;
default:
if (contentTypeLC.endsWith("+xml")) {
return PageType.XML;
}
if (contentTypeLC.startsWith("text/")) {
return PageType.TEXT;
}
return PageType.UNKNOWN;
}
}
/**
* Determines the kind of page to create from the content type.
* @param webResponse the response to investigate
* @exception IOException if an IO problem occurs
* @return "xml", "html", "javascript", "text" or "unknown"
*/
public static PageType determinePageType(final WebResponse webResponse) throws IOException {
final String contentType = webResponse.getContentType();
if (!StringUtils.isEmpty(contentType)) {
return determinePageType(contentType);
}
// sniff - http://tools.ietf.org/html/draft-abarth-mime-sniff-05
try (InputStream contentAsStream = webResponse.getContentAsStream()) {
final byte[] bytes = read(contentAsStream, 512);
if (bytes.length == 0) {
return determinePageType(MimeType.TEXT_PLAIN);
}
if (startsWith(bytes, markerUTF8_) || startsWith(bytes, markerUTF16BE_)
|| startsWith(bytes, markerUTF16LE_)) {
return determinePageType(MimeType.TEXT_PLAIN);
}
if (isBinary(bytes)) {
return determinePageType(MimeType.APPLICATION_OCTET_STREAM);
}
final String asAsciiString = new String(bytes, StandardCharsets.US_ASCII).trim().toUpperCase(Locale.ROOT);
for (final String htmlPattern : htmlPatterns) {
try {
if ('<' == asAsciiString.charAt(0)) {
if (asAsciiString.startsWith(htmlPattern, 1)) {
final char spaceOrBracket = asAsciiString.charAt(htmlPattern.length() + 1);
if (' ' == spaceOrBracket || '>' == spaceOrBracket) {
return determinePageType(MimeType.TEXT_HTML);
}
}
}
}
catch (final ArrayIndexOutOfBoundsException e) {
// ignore and try next
}
}
}
return determinePageType(MimeType.TEXT_PLAIN);
}
/**
* Creates an instance.
*/
public DefaultPageCreator() {
// Empty.
}
/**
* Create a Page object for the specified web response.
*
* @param webResponse the response from the server
* @param webWindow the window that this page will be loaded into
* @exception IOException if an IO problem occurs
* @return the new page object
*/
@Override
public Page createPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
final PageType pageType = determinePageType(webResponse);
switch (pageType) {
case HTML:
return createHtmlPage(webResponse, webWindow);
case JAVASCRIPT:
return createHtmlPage(webResponse, webWindow);
case XML:
final SgmlPage sgmlPage = createXmlPage(webResponse, webWindow);
final DomElement doc = sgmlPage.getDocumentElement();
if (doc != null && Html.XHTML_NAMESPACE.equals(doc.getNamespaceURI())) {
return createXHtmlPage(webResponse, webWindow);
}
return sgmlPage;
case TEXT:
return createTextPage(webResponse, webWindow);
default:
return createUnexpectedPage(webResponse, webWindow);
}
}
/**
* {@inheritDoc}
*/
@Override
public HTMLParser getHtmlParser() {
return htmlParser_;
}
/**
* See http://tools.ietf.org/html/draft-abarth-mime-sniff-05#section-4
* @param bytes the bytes to check
*/
private static boolean isBinary(final byte[] bytes) {
for (final byte b : bytes) {
if ((b >= 0x00 && b < 0x08)
|| b == 0x0B
|| (b >= 0x0E && b <= 0x1A)
|| (b >= 0x1C && b <= 0x1F)) {
return true;
}
}
return false;
}
private static boolean startsWith(final byte[] bytes, final byte[] lookFor) {
if (bytes.length < lookFor.length) {
return false;
}
for (int i = 0; i < lookFor.length; i++) {
if (bytes[i] != lookFor[i]) {
return false;
}
}
return true;
}
private static byte[] read(final InputStream stream, final int maxNb) throws IOException {
final byte[] buffer = new byte[maxNb];
final int nbRead = stream.read(buffer);
if (nbRead == buffer.length) {
return buffer;
}
return ArrayUtils.subarray(buffer, 0, nbRead);
}
/**
* Creates an HtmlPage for this WebResponse.
*
* @param webResponse the page's source
* @param webWindow the WebWindow to place the HtmlPage in
* @return the newly created HtmlPage
* @throws IOException if the page could not be created
*/
protected HtmlPage createHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
final HtmlPage page = new HtmlPage(webResponse, webWindow);
webWindow.setEnclosedPage(page);
htmlParser_.parse(webResponse, page, false, false);
return page;
}
/**
* Creates an XHtmlPage for this WebResponse.
*
* @param webResponse the page's source
* @param webWindow the WebWindow to place the HtmlPage in
* @return the newly created XHtmlPage
* @throws IOException if the page could not be created
*/
protected XHtmlPage createXHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
final XHtmlPage page = new XHtmlPage(webResponse, webWindow);
webWindow.setEnclosedPage(page);
htmlParser_.parse(webResponse, page, true, false);
return page;
}
/**
* Creates a TextPage for this WebResponse.
*
* @param webResponse the page's source
* @param webWindow the WebWindow to place the TextPage in
* @return the newly created TextPage
*/
protected TextPage createTextPage(final WebResponse webResponse, final WebWindow webWindow) {
final TextPage newPage = new TextPage(webResponse, webWindow);
webWindow.setEnclosedPage(newPage);
return newPage;
}
/**
* Creates an UnexpectedPage for this WebResponse.
*
* @param webResponse the page's source
* @param webWindow the WebWindow to place the UnexpectedPage in
* @return the newly created UnexpectedPage
*/
protected UnexpectedPage createUnexpectedPage(final WebResponse webResponse, final WebWindow webWindow) {
final UnexpectedPage newPage = new UnexpectedPage(webResponse, webWindow);
webWindow.setEnclosedPage(newPage);
return newPage;
}
/**
* Creates an SgmlPage for this WebResponse.
*
* @param webResponse the page's source
* @param webWindow the WebWindow to place the TextPage in
* @return the newly created TextPage
* @throws IOException if the page could not be created
*/
protected SgmlPage createXmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
final SgmlPage page = new XmlPage(webResponse, webWindow);
webWindow.setEnclosedPage(page);
return page;
}
}