All downloads are free. The search and download functionality uses the official Maven repository.

com.opensymphony.module.sitemesh.parser.HTMLPageParser Maven / Gradle / Ivy

package com.opensymphony.module.sitemesh.parser;

import com.opensymphony.module.sitemesh.DefaultSitemeshBuffer;
import com.opensymphony.module.sitemesh.Page;
import com.opensymphony.module.sitemesh.PageParser;
import com.opensymphony.module.sitemesh.SitemeshBuffer;
import com.opensymphony.module.sitemesh.SitemeshBufferFragment;
import com.opensymphony.module.sitemesh.html.HTMLProcessor;
import com.opensymphony.module.sitemesh.html.State;
import com.opensymphony.module.sitemesh.html.StateTransitionRule;
import com.opensymphony.module.sitemesh.html.util.CharArray;
import com.opensymphony.module.sitemesh.html.rules.BodyTagRule;
import com.opensymphony.module.sitemesh.html.rules.ContentBlockExtractingRule;
import com.opensymphony.module.sitemesh.html.rules.FramesetRule;
import com.opensymphony.module.sitemesh.html.rules.HeadExtractingRule;
import com.opensymphony.module.sitemesh.html.rules.HtmlAttributesRule;
import com.opensymphony.module.sitemesh.html.rules.MSOfficeDocumentPropertiesRule;
import com.opensymphony.module.sitemesh.html.rules.MetaTagRule;
import com.opensymphony.module.sitemesh.html.rules.ParameterExtractingRule;
import com.opensymphony.module.sitemesh.html.rules.TitleExtractingRule;
import com.opensymphony.module.sitemesh.html.rules.PageBuilder;

import java.io.IOException;

/**
 * 

Builds an HTMLPage object from an HTML document. This behaves * similarly to the FastPageParser, however it's a complete rewrite that is simpler to add custom features to such as * extraction and transformation of elements.

* *

To customize the rules used, this class can be extended and have the userDefinedRules() methods overridden.

* * @author Joe Walnes * * @see HTMLProcessor */ public class HTMLPageParser implements PageParser { public Page parse(char[] buffer) throws IOException { return parse(new DefaultSitemeshBuffer(buffer)); } public Page parse(SitemeshBuffer buffer) throws IOException { SitemeshBufferFragment.Builder head = SitemeshBufferFragment.builder().setBuffer(buffer).setLength(0); SitemeshBufferFragment.Builder body = SitemeshBufferFragment.builder().setBuffer(buffer); TokenizedHTMLPage page = new TokenizedHTMLPage(buffer); HTMLProcessor processor = new HTMLProcessor(buffer, body); State html = processor.defaultState(); // Core rules for SiteMesh to be functional. html.addRule(new HeadExtractingRule(head)); // contents of html.addRule(new BodyTagRule(page, body)); // contents of html.addRule(new TitleExtractingRule(page)); // the html.addRule(new FramesetRule(page)); // if the page is a frameset // Additional rules - designed to be tweaked. addUserDefinedRules(html, page); processor.process(); page.setBody(body.build()); page.setHead(head.build()); return page; } protected void addUserDefinedRules(State html, PageBuilder page) { // Ensure that while in <xml> tag, none of the other rules kick in. // For example <xml><book><title>hello should not change the affect the title of the page. State xml = new State(); html.addRule(new StateTransitionRule("xml", xml)); // Useful properties html.addRule(new HtmlAttributesRule(page)); // attributes in element html.addRule(new MetaTagRule(page)); // all tags html.addRule(new ParameterExtractingRule(page)); // blocks html.addRule(new ContentBlockExtractingRule(page)); // blocks // Capture properties written to documents by MS Office (author, version, company, etc). // Note: These properties are from the xml state, not the html state. xml.addRule(new MSOfficeDocumentPropertiesRule(page)); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy