com.itextpdf.styledxmlparser.jsoup.parser.TreeBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of styled-xml-parser Show documentation
Show all versions of styled-xml-parser Show documentation
Styled XML parser is used by iText modules to parse HTML and XML
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2024 Apryse Group NV
Authors: Apryse Software.
This program is offered under a commercial and under the AGPL license.
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
AGPL licensing:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.itextpdf.styledxmlparser.jsoup.parser;
import com.itextpdf.styledxmlparser.jsoup.helper.Validate;
import com.itextpdf.styledxmlparser.jsoup.nodes.Attributes;
import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
/**
 * Base class for tree builders: reads tokens from a {@link Tokeniser} and hands each one to
 * {@link #process(Token)}, which concrete subclasses implement to build the {@link Document}.
 *
 * <p>The per-parse state ({@code reader}, {@code tokeniser}, {@code stack}) is created by
 * {@link #initialiseParse(Reader, String, Parser)} and dropped again at the end of
 * {@link #parse(Reader, String, Parser)}.
 */
public abstract class TreeBuilder {
    protected Parser parser;
    CharacterReader reader;
    Tokeniser tokeniser;
    protected Document doc; // current doc we are building into
    protected ArrayList<Element> stack; // the stack of open elements
    protected String baseUri; // current base uri, for creating new elements
    protected Token currentToken; // currentToken is used only for error tracking.
    protected ParseSettings settings;

    // Recycled tokens for the processStartTag / processEndTag helpers; never reassigned.
    private final Token.StartTag start = new Token.StartTag(); // start tag to process
    private final Token.EndTag end = new Token.EndTag();

    /**
     * Returns the {@link ParseSettings} that a concrete builder defines as its default.
     *
     * @return the default settings of the implementing builder
     */
    abstract ParseSettings defaultSettings();

    /**
     * Validates the arguments and sets up all state needed for a new parse: a fresh
     * {@link Document}, the character reader, the tokeniser and an empty element stack.
     *
     * @param input   source to read from; must not be null
     * @param baseUri base URI for the new document; must not be null
     * @param parser  parser supplying the settings and the error list; must not be null
     */
    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
        Validate.notNull(input, "String input must not be null");
        Validate.notNull(baseUri, "BaseURI must not be null");
        Validate.notNull(parser);

        doc = new Document(baseUri);
        doc.parser(parser);
        this.parser = parser;
        settings = parser.settings();
        reader = new CharacterReader(input);
        currentToken = null;
        tokeniser = new Tokeniser(reader, parser.getErrors());
        stack = new ArrayList<>(32);
        this.baseUri = baseUri;
    }

    /**
     * Runs a complete parse of {@code input} and returns the resulting document.
     *
     * <p>The reader is closed and the per-parse state is released even when the parse fails,
     * so a failed parse does not leak the reader or leave stale state on this builder.
     *
     * @param input   source to read from
     * @param baseUri base URI for the new document
     * @param parser  parser supplying the settings and the error list
     * @return the document that was built
     */
    Document parse(Reader input, String baseUri, Parser parser) {
        initialiseParse(input, baseUri, parser);
        try {
            runParser();
        } finally {
            // tidy up - as the Parser and Treebuilder are retained in document for settings / fragments
            reader.close();
            reader = null;
            tokeniser = null;
            stack = null;
        }
        return doc;
    }

    /**
     Create a new copy of this TreeBuilder
     @return copy, ready for a new parse
     */
    abstract TreeBuilder newInstance();

    /**
     * Parses a fragment of input; implemented by the concrete builders.
     *
     * @param inputFragment the fragment text to parse
     * @param context       the context element supplied by the caller
     * @param baseUri       base URI for the parse
     * @param parser        parser supplying the settings and the error list
     * @return the nodes produced from the fragment
     */
    abstract List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser);

    /**
     * Reads tokens from the tokeniser and passes each to {@link #process(Token)} until the
     * EOF token has been processed. Every token is reset after it has been processed.
     */
    protected void runParser() {
        final Tokeniser tokeniser = this.tokeniser;
        final Token.TokenType eof = Token.TokenType.EOF;

        while (true) {
            Token token = tokeniser.read();
            process(token);
            token.reset();

            // the EOF token itself is processed before the loop stops
            if (token.type == eof) {
                break;
            }
        }
    }

    /**
     * Handles a single token; implemented by the concrete builders.
     *
     * @param token the token to handle
     * @return the result reported by the implementing builder
     */
    protected abstract boolean process(Token token);

    /**
     * Processes a start tag with the given name, reusing the shared start token when it is
     * not the token currently being handled.
     *
     * @param name tag name
     * @return the result of {@link #process(Token)}
     */
    protected boolean processStartTag(String name) {
        final Token.StartTag start = this.start;
        if (currentToken == start) { // don't recycle an in-use token
            return process(new Token.StartTag().name(name));
        }
        return process(((Token.Tag) start.reset()).name(name));
    }

    /**
     * Processes a start tag with the given name and attributes, reusing the shared start
     * token when it is not the token currently being handled.
     *
     * @param name  tag name
     * @param attrs attributes to set on the tag
     * @return the result of {@link #process(Token)}
     */
    public boolean processStartTag(String name, Attributes attrs) {
        final Token.StartTag start = this.start;
        if (currentToken == start) { // don't recycle an in-use token
            return process(new Token.StartTag().nameAttr(name, attrs));
        }
        start.reset();
        start.nameAttr(name, attrs);
        return process(start);
    }

    /**
     * Processes an end tag with the given name, reusing the shared end token when it is not
     * the token currently being handled.
     *
     * @param name tag name
     * @return the result of {@link #process(Token)}
     */
    protected boolean processEndTag(String name) {
        if (currentToken == end) { // don't recycle an in-use token
            return process(new Token.EndTag().name(name));
        }
        return process(((Token.Tag) end.reset()).name(name));
    }

    /**
     * Returns the element on top of the stack of open elements.
     *
     * @return the last element pushed onto the stack, or {@code null} if the stack is empty
     */
    protected Element currentElement() {
        int size = stack.size();
        return size > 0 ? stack.get(size - 1) : null;
    }

    /**
     * If the parser is tracking errors, add an error at the current position.
     * @param msg error message
     */
    protected void error(String msg) {
        ParseErrorList errors = parser.getErrors();
        if (errors.canAddError()) {
            errors.add(new ParseError(reader.pos(), msg));
        }
    }

    /**
     (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as
     Data Nodes).
     This base implementation always returns {@code false}.
     */
    protected boolean isContentForTagData(String normalName) {
        return false;
    }
}