All downloads are free. Search and download functionality uses the official Maven repository.

jodd.lagarto.dom.LagartoDOMBuilder Maven / Gradle / Ivy

Go to download

Jodd Lagarto is a fast and versatile all-purpose HTML parser. It includes Jerry and CSSelly.

There is a newer version: 6.0.6
Show newest version
// Copyright (c) 2003-present, Jodd Team (http://jodd.org)
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package jodd.lagarto.dom;

import jodd.lagarto.LagartoParser;

/**
 * Builds a DOM tree out of HTML, XHTML or XML content using the Lagarto parser.
 * A freshly constructed builder is set up for {@link #enableHtmlMode() HTML mode};
 * the remaining <code>enable...</code> methods switch the configuration to
 * a different preset.
 */
public class LagartoDOMBuilder implements DOMBuilder {

	// ---------------------------------------------------------------- config

	/**
	 * Configuration that is passed to the parser on every parse run.
	 */
	protected LagartoDomBuilderConfig config = new LagartoDomBuilderConfig();

	/**
	 * Creates a builder preconfigured for HTML mode.
	 */
	public LagartoDOMBuilder() {
		enableHtmlMode();
	}

	/**
	 * Returns the configuration currently used by this builder.
	 */
	public LagartoDomBuilderConfig getConfig() {
		return this.config;
	}

	/**
	 * Replaces the configuration used by this builder.
	 */
	public void setConfig(final LagartoDomBuilderConfig config) {
		this.config = config;
	}

	// ---------------------------------------------------------------- quick settings

	/**
	 * Turns debug mode on: errors are collected and positions are calculated.
	 * This costs performance.
	 */
	public LagartoDOMBuilder enableDebug() {
		return switchDebug(true);
	}

	/**
	 * Turns debug mode off: errors are not collected and positions are
	 * not calculated.
	 */
	public LagartoDOMBuilder disableDebug() {
		return switchDebug(false);
	}

	/**
	 * Applies both debug-related flags in one place, so that enabling and
	 * disabling always touch the same pair of settings.
	 */
	private LagartoDOMBuilder switchDebug(final boolean debugOn) {
		final LagartoDomBuilderConfig settings = config;

		settings.collectErrors = debugOn;
		settings.setCalculatePosition(debugOn);

		return this;
	}

	/**
	 * Applies the {@link #enableHtmlMode() html mode} preset and then switches on
	 * additional, somewhat experimental rules.
	 */
	public LagartoDOMBuilder enableHtmlPlusMode() {
		enableHtmlMode();

		// read the field only after the html preset has been applied
		final LagartoDomBuilderConfig settings = config;

		settings.useFosterRules = true;
		settings.unclosedTagAsOrphanCheck = true;

		return this;
	}

	/**
	 * Enables HTML5 parsing mode.
	 */
	public LagartoDOMBuilder enableHtmlMode() {
		final LagartoDomBuilderConfig settings = config;

		// keep every whitespace found between tags
		settings.ignoreWhitespacesBetweenTags = false;
		// HTML is case insensitive
		settings.setCaseSensitive(false);
		// script and style tags are parsed as CDATA
		settings.setEnableRawTextModes(true);
		// use the list of void tags
		settings.enabledVoidTags = true;
		// void tags are not self-closed
		settings.selfCloseVoidTags = false;
		// the end of some tags is implied
		settings.impliedEndTags = true;
		// IE conditional comments stay disabled
		settings.setEnableConditionalComments(false);
		// XML mode in parsing stays disabled
		settings.setParseXmlTags(false);

		return this;
	}

	/**
	 * Enables XHTML mode.
	 */
	public LagartoDOMBuilder enableXhtmlMode() {
		final LagartoDomBuilderConfig settings = config;

		// keep every whitespace found between tags
		settings.ignoreWhitespacesBetweenTags = false;
		// XHTML is case sensitive
		settings.setCaseSensitive(true);
		// all tags are parsed in the same way
		settings.setEnableRawTextModes(false);
		// use the list of void tags
		settings.enabledVoidTags = true;
		// void tags are self-closed
		settings.selfCloseVoidTags = true;
		// tag ends are never implied
		settings.impliedEndTags = false;
		// IE conditional comments stay disabled
		settings.setEnableConditionalComments(false);
		// XML mode in parsing stays disabled
		settings.setParseXmlTags(false);

		return this;
	}

	/**
	 * Enables XML parsing mode.
	 */
	public LagartoDOMBuilder enableXmlMode() {
		final LagartoDomBuilderConfig settings = config;

		// whitespaces that are not content are ignored
		settings.ignoreWhitespacesBetweenTags = true;
		// XML is case sensitive
		settings.setCaseSensitive(true);
		// all tags are parsed in the same way
		settings.setEnableRawTextModes(false);
		// XML has no void tags
		settings.enabledVoidTags = false;
		// empty tags are not self-closed (can be changed!)
		settings.selfCloseVoidTags = false;
		// tag ends are never implied
		settings.impliedEndTags = false;
		// IE conditional comments are disabled
		settings.setEnableConditionalComments(false);
		// XML mode in parsing is enabled
		settings.setParseXmlTags(true);

		return this;
	}

	// ---------------------------------------------------------------- parse

	/**
	 * Builds a DOM tree from the given character array.
	 */
	@Override
	public Document parse(final char[] content) {
		return doParse(new LagartoParser(content));
	}

	/**
	 * Builds a DOM tree from the given string.
	 */
	@Override
	public Document parse(final String content) {
		return doParse(new LagartoParser(content));
	}

	/**
	 * Runs the supplied parser with this builder's configuration and returns
	 * the document assembled by the DOM-building tag visitor.
	 */
	protected Document doParse(final LagartoParser lagartoParser) {
		lagartoParser.setConfig(config);

		final LagartoDOMBuilderTagVisitor visitor = new LagartoDOMBuilderTagVisitor(this);
		lagartoParser.parse(visitor);

		return visitor.getDocument();
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy