org.jsoup.parser.PrefixXmlTreeBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of crawler Show documentation
This is an open-source Java project. It integrates Apache Commons VFS and jsoup, which makes grabbing data much easier.
There is a newer version: 1.1.1
package org.jsoup.parser;
/**
 * Fools Token.Read by adding a prefix to the tag names,
 * then restores the original TagName before the StartTag and EndTag are written.
 *
 * Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, new Parser(new PrefixXmlTreeBuilder("prefixName")));}
 *
 * @author Abola Lee
 ****
 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the
 * document.
 * Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}
 *
 * @author Jonathan Hedley
 */

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document.OutputSettings.Syntax;
import org.jsoup.nodes.*;
import org.jsoup.parser.Token.Comment;
import org.jsoup.parser.Token.Doctype;
import org.jsoup.parser.Token.EndTag;
import org.jsoup.parser.Token.StartTag;

import java.util.List;

/**
 * XML tree builder that removes a caller-supplied prefix from element names.
 *
 * <p>Start-tag and end-tag tokens are expected to carry {@code prefix} at the front of their
 * names. The prefix is stripped before an element is created for a start tag and before the
 * open-element stack is searched for the element an end tag closes. Comment, character and
 * doctype tokens are inserted unchanged.
 */
public class PrefixXmlTreeBuilder extends XmlTreeBuilder {
    /** Prefix stripped from the front of start/end tag names; never {@code null}. */
    final String prefix;

    /**
     * Creates a builder that strips {@code prefix} from tag names.
     *
     * @param prefix the prefix to strip; {@code null} is treated as "no prefix"
     */
    public PrefixXmlTreeBuilder(String prefix) {
        // A null prefix used to surface later as a NullPointerException on the first tag.
        this.prefix = (prefix == null) ? "" : prefix;
    }

    /**
     * Dispatches one token to the matching insert/close routine.
     *
     * @param token the token to handle
     * @return always {@code true}
     */
    protected boolean process(Token token) {
        switch (token.type) {
            case StartTag:
                insert(token.asStartTag());
                break;
            case EndTag:
                popStackToClose(token.asEndTag());
                break;
            case Comment:
                // Must go to insert(Comment): a comment token is not a start tag, so converting
                // it with asStartTag() cannot succeed.
                insert(token.asComment());
                break;
            case Character:
                insert(token.asCharacter());
                break;
            case Doctype:
                insert(token.asDoctype());
                break;
            case EOF: // could put some normalisation here if desired
                break;
            default:
                Validate.fail("Unexpected token type: " + token.type);
        }

        return true;
    }

    /**
     * Prepares a parse: delegates to the superclass, makes sure the document is on the
     * open-element stack, and switches the document's output syntax to XML.
     */
    protected void initialiseParse(String input, String baseUri, ParseErrorList errors) {
        super.initialiseParse(input, baseUri, errors);
        // NOTE(review): the superclass presumably pushes the document itself; the guard keeps
        // the document from appearing on the stack twice either way.
        if (!this.stack.contains(this.doc)) {
            this.stack.add(this.doc);
        }
        this.doc.outputSettings().syntax(Syntax.xml);
    }

    /** Appends {@code node} as the last child of the current element. */
    private void insertNode(Node node) {
        this.currentElement().appendChild(node);
    }

    /**
     * Returns {@code name} with the leading {@link #prefix} removed.
     *
     * <p>Only a prefix at the very start of the name is removed, and only when something remains
     * afterwards; occurrences elsewhere in the name are left alone. Matching is case-sensitive.
     * NOTE(review): if the tokeniser lower-cases tag names, the prefix must be lower-case too;
     * confirm against the jsoup version in use.
     */
    private String stripPrefix(String name) {
        if (this.prefix.isEmpty()) {
            return name;
        }
        boolean prefixed = name.length() > this.prefix.length() && name.startsWith(this.prefix);
        return prefixed ? name.substring(this.prefix.length()) : name;
    }

    /**
     * Creates an element for {@code startTag} (with the prefix stripped from its name), appends it
     * to the current element and, unless the tag is self-closing, pushes it on the stack.
     *
     * @param startTag the start-tag token
     * @return the element that was inserted
     */
    Element insert(StartTag startTag) {
        Tag tag = Tag.valueOf(stripPrefix(startTag.name()));
        Element el = new Element(tag, this.baseUri, startTag.attributes);
        this.insertNode(el);
        if (startTag.isSelfClosing()) {
            this.tokeniser.acknowledgeSelfClosingFlag();
            if (!tag.isKnownTag()) {
                // Unknown tag: mark it as self-closing on the tag itself.
                tag.setSelfClosing();
            }
        } else {
            this.stack.add(el);
        }

        return el;
    }

    /**
     * Inserts a comment node for {@code commentToken}. A token flagged as bogus whose data is
     * longer than one character and starts with {@code !} or {@code ?} is inserted as an
     * {@link XmlDeclaration} instead, built from the data after that first character.
     */
    void insert(Comment commentToken) {
        org.jsoup.nodes.Comment comment = new org.jsoup.nodes.Comment(commentToken.getData(), this.baseUri);
        Node toInsert = comment;
        if (commentToken.bogus) {
            String data = comment.getData();
            if (data.length() > 1 && (data.startsWith("!") || data.startsWith("?"))) {
                String declaration = data.substring(1);
                toInsert = new XmlDeclaration(declaration, comment.baseUri(), data.startsWith("!"));
            }
        }

        this.insertNode(toInsert);
    }

    /** Inserts a text node holding the data of {@code characterToken}. */
    void insert(Token.Character characterToken) {
        TextNode node = new TextNode(characterToken.getData(), this.baseUri);
        this.insertNode(node);
    }

    /** Inserts a document-type node built from the name and identifiers of {@code d}. */
    void insert(Doctype d) {
        DocumentType doctypeNode = new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), this.baseUri);
        this.insertNode(doctypeNode);
    }

    /**
     * Closes the element named by {@code endTag} (prefix stripped): searching from the top of the
     * stack, finds the nearest element with that node name and pops it together with everything
     * above it. When no element on the stack matches, the stack is left untouched.
     */
    private void popStackToClose(EndTag endTag) {
        String elName = stripPrefix(endTag.name());

        int matchIndex = -1;
        for (int pos = this.stack.size() - 1; pos >= 0; --pos) {
            if (this.stack.get(pos).nodeName().equals(elName)) {
                matchIndex = pos;
                break;
            }
        }

        if (matchIndex < 0) {
            return; // stray end tag: nothing to close
        }

        for (int pos = this.stack.size() - 1; pos >= matchIndex; --pos) {
            this.stack.remove(pos);
        }
    }

    /**
     * Parses {@code inputFragment} and returns the resulting top-level nodes.
     *
     * @return the child nodes of the parsed document
     */
    List<Node> parseFragment(String inputFragment, String baseUri, ParseErrorList errors) {
        this.initialiseParse(inputFragment, baseUri, errors);
        this.runParser();
        return this.doc.childNodes();
    }
}