All Downloads are FREE. The search and download functionality uses the official Maven repository.

org.jsoup.parser.Parser Maven / Gradle / Ivy

There is a newer version: 1.18.3
Show newest version
package org.jsoup.parser;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

import java.io.Reader;
import java.io.StringReader;
import java.util.List;

/**
 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
 {@link org.jsoup.Jsoup}.
 

Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded environment, use {@link #newInstance()} to make copies. */ public class Parser { public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml"; public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace"; public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML"; public static final String NamespaceSvg = "http://www.w3.org/2000/svg"; private TreeBuilder treeBuilder; private ParseErrorList errors; private ParseSettings settings; private boolean trackPosition = false; /** * Create a new Parser, using the specified TreeBuilder * @param treeBuilder TreeBuilder to use to parse input into Documents. */ public Parser(TreeBuilder treeBuilder) { this.treeBuilder = treeBuilder; settings = treeBuilder.defaultSettings(); errors = ParseErrorList.noTracking(); } /** Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use. @return a copied parser */ public Parser newInstance() { return new Parser(this); } private Parser(Parser copy) { treeBuilder = copy.treeBuilder.newInstance(); // because extended errors = new ParseErrorList(copy.errors); // only copies size, not contents settings = new ParseSettings(copy.settings); trackPosition = copy.trackPosition; } public Document parseInput(String html, String baseUri) { return treeBuilder.parse(new StringReader(html), baseUri, this); } public Document parseInput(Reader inputHtml, String baseUri) { return treeBuilder.parse(inputHtml, baseUri, this); } public List parseFragmentInput(String fragment, Element context, String baseUri) { return treeBuilder.parseFragment(fragment, context, baseUri, this); } // gets & sets /** * Get the TreeBuilder currently in use. * @return current TreeBuilder. */ public TreeBuilder getTreeBuilder() { return treeBuilder; } /** * Update the TreeBuilder used when parsing content. 
* @param treeBuilder new TreeBuilder * @return this, for chaining */ public Parser setTreeBuilder(TreeBuilder treeBuilder) { this.treeBuilder = treeBuilder; treeBuilder.parser = this; return this; } /** * Check if parse error tracking is enabled. * @return current track error state. */ public boolean isTrackErrors() { return errors.getMaxSize() > 0; } /** * Enable or disable parse error tracking for the next parse. * @param maxErrors the maximum number of errors to track. Set to 0 to disable. * @return this, for chaining */ public Parser setTrackErrors(int maxErrors) { errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); return this; } /** * Retrieve the parse errors, if any, from the last parse. * @return list of parse errors, up to the size of the maximum errors tracked. * @see #setTrackErrors(int) */ public ParseErrorList getErrors() { return errors; } /** Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input source they were created from. By default, tracking is not enabled. * @return current track position setting */ public boolean isTrackPosition() { return trackPosition; } /** Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original input source they were created from. @param trackPosition position tracking setting; {@code true} to enable @return this Parser, for chaining */ public Parser setTrackPosition(boolean trackPosition) { this.trackPosition = trackPosition; return this; } /** Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes. * @param settings the new settings * @return this Parser */ public Parser settings(ParseSettings settings) { this.settings = settings; return this; } /** Gets the current ParseSettings for this Parser * @return current ParseSettings */ public ParseSettings settings() { return settings; } /** (An internal method, visible for Element. 
For HTML parse, signals that script and style text should be treated as Data Nodes). */ public boolean isContentForTagData(String normalName) { return getTreeBuilder().isContentForTagData(normalName); } public String defaultNamespace() { return getTreeBuilder().defaultNamespace(); } // static parse functions below /** * Parse HTML into a Document. * * @param html HTML to parse * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. * * @return parsed Document */ public static Document parse(String html, String baseUri) { TreeBuilder treeBuilder = new HtmlTreeBuilder(); return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder)); } /** * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. * * @param fragmentHtml the fragment of HTML to parse * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This * provides stack context (for implicit element creation). * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. * * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. */ public static List parseFragment(String fragmentHtml, Element context, String baseUri) { HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); return treeBuilder.parseFragment(fragmentHtml, context, baseUri, new Parser(treeBuilder)); } /** * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. * * @param fragmentHtml the fragment of HTML to parse * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This * provides stack context (for implicit element creation). * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 
* @param errorList list to add errors to * * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. */ public static List parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) { HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); Parser parser = new Parser(treeBuilder); parser.errors = errorList; return treeBuilder.parseFragment(fragmentHtml, context, baseUri, parser); } /** * Parse a fragment of XML into a list of nodes. * * @param fragmentXml the fragment of XML to parse * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. * @return list of nodes parsed from the input XML. */ public static List parseXmlFragment(String fragmentXml, String baseUri) { XmlTreeBuilder treeBuilder = new XmlTreeBuilder(); return treeBuilder.parseFragment(fragmentXml, null, baseUri, new Parser(treeBuilder)); } /** * Parse a fragment of HTML into the {@code body} of a Document. * * @param bodyHtml fragment of HTML * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 
* * @return Document, with empty head, and HTML parsed into body */ public static Document parseBodyFragment(String bodyHtml, String baseUri) { Document doc = Document.createShell(baseUri); Element body = doc.body(); List nodeList = parseFragment(bodyHtml, body, baseUri); Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented for (int i = nodes.length - 1; i > 0; i--) { nodes[i].remove(); } for (Node node : nodes) { body.appendChild(node); } return doc; } /** * Utility method to unescape HTML entities from a string * @param string HTML escaped string * @param inAttribute if the string is to be escaped in strict mode (as attributes are) * @return an unescaped string */ public static String unescapeEntities(String string, boolean inAttribute) { Parser parser = Parser.htmlParser(); parser.treeBuilder.initialiseParse(new StringReader(string), "", parser); Tokeniser tokeniser = new Tokeniser(parser.treeBuilder); return tokeniser.unescapeEntities(inAttribute); } // builders /** * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document, * based on a knowledge of the semantics of the incoming tags. * @return a new HTML parser. */ public static Parser htmlParser() { return new Parser(new HtmlTreeBuilder()); } /** * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML, * rather creates a simple tree directly from the input. * @return a new simple XML parser. */ public static Parser xmlParser() { return new Parser(new XmlTreeBuilder()); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy