// info.novatec.testit.livingdoc.html.HtmlDocumentBuilder Maven / Gradle / Ivy
/* Copyright (c) 2006 Pyxis Technologies inc.
*
* This is free software; you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option) any later
* version.
*
* This software is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin St, Fifth Floor, Boston, MA 02110-1301 USA, or see the FSF site:
* http://www.fsf.org. */
package info.novatec.testit.livingdoc.html;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import info.novatec.testit.livingdoc.Example;
import info.novatec.testit.livingdoc.TextExample;
import info.novatec.testit.livingdoc.document.Document;
import info.novatec.testit.livingdoc.repository.DocumentBuilder;
import info.novatec.testit.livingdoc.util.CollectionUtil;
/**
 * Builds a {@link Document} from HTML text by extracting nested elements with
 * regular expressions.
 *
 * <p>The builder is configured with an ordered list of tag specifications, one
 * per nesting level. Each specification is a space-separated list of
 * alternative tag names (for example {@code "td th"}). An alternative may be
 * prefixed with a {@code parent>} selector (for example {@code "ol>li"}); the
 * prefix is stripped when the matched parent element has that tag name.
 *
 * <p>Optional {@link HtmlContentFilter}s rewrite the content of the elements
 * they declare to handle before that content is parsed further.
 */
public class HtmlDocumentBuilder implements DocumentBuilder {

    /** Tag specifications, outermost nesting level first. Never empty. */
    private final List<String> tags;

    /** Content filters; the most recently added filter takes precedence. */
    private final List<HtmlContentFilter> filters = new ArrayList<HtmlContentFilter>();

    /**
     * Creates a builder that extracts {@code table} / {@code tr} /
     * {@code td|th} structures.
     *
     * @return a new builder configured for tables
     */
    public static HtmlDocumentBuilder tables() {
        return new HtmlDocumentBuilder("table", "tr", "td th");
    }

    /**
     * Creates a builder that extracts tables as well as ordered and unordered
     * lists, with a {@link BulletListFilter} registered.
     *
     * @return a new builder configured for tables and lists
     */
    public static HtmlDocumentBuilder tablesAndLists() {
        return new HtmlDocumentBuilder("table ul ol", "table>tr ol>li ul>li", "tr>td tr>th li>span li>b li>i li>u li>em")
            .addFilter(new BulletListFilter());
    }

    /**
     * Creates a builder for the given tag specifications.
     *
     * @param tags tag specifications, outermost nesting level first
     * @throws IllegalArgumentException if no tag is given
     */
    public HtmlDocumentBuilder(String... tags) {
        this(Arrays.asList(tags));
    }

    /**
     * Creates a builder for the given tag specifications. The list is copied,
     * so later changes to the argument do not affect this builder.
     *
     * @param tags tag specifications, outermost nesting level first
     * @throws IllegalArgumentException if {@code tags} is empty
     */
    public HtmlDocumentBuilder(List<String> tags) {
        if (tags.isEmpty()) {
            throw new IllegalArgumentException("Specify at least a tag");
        }
        this.tags = new ArrayList<String>(tags);
    }

    /**
     * Reads all text from {@code reader} and turns it into an HTML document.
     * When no configured element is found, the whole text is wrapped in a
     * {@link TextExample}. The reader is not closed by this method.
     *
     * @param reader source of the HTML text
     * @return the document built from the text
     * @throws IOException if reading from {@code reader} fails
     */
    @Override
    public Document build(Reader reader) throws IOException {
        String html = IOUtils.toString(reader);
        Example example = parse(html);
        if (example == null) {
            example = new TextExample(html);
        }
        return Document.html(example, name(html), externalLink(html));
    }

    /**
     * Parses the given HTML after removing HTML comments from it.
     *
     * @param html the HTML text to parse
     * @return the first example found, or {@code null} when the text contains
     *         no element matching the first configured tag specification
     */
    public Example parse(String html) {
        String text = removeComments(html);
        return doParse(text);
    }

    /**
     * Matches the first element of the outermost tag specification in
     * {@code text} and recursively parses its content (children) and the text
     * following it (siblings).
     *
     * @param text comment-free HTML text
     * @return the example for the first matching element, or {@code null}
     */
    private Example doParse(String text) {
        // Cheap pre-check so the expensive element pattern is never run on
        // text that cannot match it.
        if (pathologicalCase(text)) {
            return null;
        }
        Matcher match = match(text);
        if (!match.find()) {
            return null;
        }
        Example child = hasChild(match) ? childParser(match).doParse(content(match)) : null;
        Example sibling = hasSibling(match) ? doParse(tail(match)) : null;
        return new HtmlExample(lead(match), startTag(match), tag(match), content(match), endTag(match), tail(match),
            childTags(match), child, sibling);
    }

    /**
     * Tells whether {@code text} cannot contain a complete element of the
     * outermost tag specification: either no opening tag is present, or no
     * closing tag follows the first opening tag. Running the full element
     * pattern on such text may backtrack for a very long time:
     *
     * <blockquote>
     * Most regex implementations today do not build a DFA / NFA -- especially
     * those that offer backreferences (which are not "regular" at all).
     *
     * And because they do NOT build DFAs and NFAs, it's very simple to
     * construct pathological cases - e.g., "((a*)(a*))+b" can take
     * exponentially long to decide that aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaac is
     * not in the language using the matching techniques commonly in use.
     * -- Ori Berger
     * </blockquote>
     *
     * @param text the text to check
     * @return {@code true} if the text matches the pathological case, else
     *         {@code false}
     */
    private boolean pathologicalCase(String text) {
        String tag = toRegex(CollectionUtil.first(tags));
        String regex = String.format("(?is)(<\\s*(%s)\\s*.*?>)", tag);
        Matcher match = compile(regex).matcher(text);
        if (match.find()) {
            regex = String.format("(?is)(.*?)(<\\s*/\\s*(%s)\\s*.*?>)", tag);
            // Only look for a closing tag after the end of the opening tag.
            return !compile(regex).matcher(text).find(match.end());
        }
        return true;
    }

    /** Returns a matcher for the outermost tag specification over {@code text}. */
    private Matcher match(String text) {
        return compile(firstTag(tags)).matcher(text);
    }

    /** Tells whether the text after the matched element holds another element of the same level. */
    private boolean hasSibling(Matcher match) {
        return tail(match).matches(firstTag(tags));
    }

    /** Creates a builder for the next nesting level that shares this builder's filters. */
    private HtmlDocumentBuilder childParser(Matcher match) {
        return new HtmlDocumentBuilder(childTags(match)).addAllFilters(filters);
    }

    /** Tells whether a deeper level is configured and the matched content holds an element of it. */
    private boolean hasChild(Matcher match) {
        return !childTags(match).isEmpty() && content(match).matches(firstTag(childTags(match)));
    }

    /**
     * Returns the tag specifications of all deeper nesting levels, with the
     * {@code parent>} selector of the matched tag stripped from each of them.
     */
    private List<String> childTags(Matcher matcher) {
        List<String> moreTags = new ArrayList<String>();
        for (int i = 1; i < tags.size(); i++) {
            moreTags.add(stripSelector(tag(matcher), tags.get(i)));
        }
        return moreTags;
    }

    /** Returns the element pattern for the first specification in {@code listTags}. */
    private String firstTag(List<String> listTags) {
        return elementPattern(toRegex(CollectionUtil.first(listTags)));
    }

    /** Text following the matched element (group 6 of the element pattern). */
    private String tail(Matcher matcher) {
        return matcher.group(6);
    }

    /** Closing tag of the matched element (group 5 of the element pattern). */
    private String endTag(Matcher matcher) {
        return matcher.group(5);
    }

    /**
     * Content of the matched element (group 4 of the element pattern), passed
     * through the most recently added filter that handles the matched tag, if
     * there is one.
     */
    private String content(Matcher matcher) {
        String content = matcher.group(4);
        // Iterate backwards so that filters added later override earlier ones.
        for (int i = filters.size() - 1; i >= 0; i--) {
            HtmlContentFilter filter = filters.get(i);
            if (filter.handles(tag(matcher))) {
                return filter.process(content);
            }
        }
        return content;
    }

    /** Lower-cased name of the matched tag (group 3 of the element pattern). */
    private String tag(Matcher matcher) {
        // Locale.ROOT keeps tag names stable under locales with special casing
        // rules (e.g. Turkish, where "LI" would otherwise become "l\u0131").
        return matcher.group(3).toLowerCase(Locale.ROOT);
    }

    /** Opening tag of the matched element (group 2 of the element pattern). */
    private String startTag(Matcher matcher) {
        return matcher.group(2);
    }

    /** Text preceding the matched element (group 1 of the element pattern). */
    private String lead(Matcher matcher) {
        return matcher.group(1);
    }

    private Pattern compile(String regex) {
        return Pattern.compile(regex);
    }

    /** Removes HTML comments, including comments spanning several lines. */
    private String removeComments(String html) {
        return html.replaceAll("(?is)<!--.*?-->", "");
    }

    /** Removes every {@code selector>} prefix from the tag specification {@code tag}. */
    private String stripSelector(String selector, String tag) {
        // Literal replacement: the selector is a tag name, not a pattern.
        return tag.replace(selector + ">", "");
    }

    /**
     * Builds the pattern matching one complete element whose name is one of the
     * alternatives in {@code t}. Groups: 1 lead, 2 start tag, 3 tag name,
     * 4 content, 5 end tag, 6 tail.
     */
    private String elementPattern(String t) {
        return String.format("(?is)(.*?)(<\\s*(%s)\\s*.*?>)(.*?)(<\\s*/\\s*\\3\\s*>)(.*)", t);
    }

    /** Turns a space-separated tag specification into a regex alternation. */
    private String toRegex(String tag) {
        return tag.replace(' ', '|');
    }

    /** Returns the content of the {@code title} meta tag, or {@code null} if absent. */
    private String name(String content) {
        return meta(content, "title");
    }

    /** Returns the content of the {@code external-link} meta tag, or {@code null} if absent. */
    private String externalLink(String content) {
        return meta(content, "external-link");
    }

    /**
     * Extracts the {@code content} attribute of the first
     * {@code <meta name="NAME" content="...">} tag in the given HTML.
     *
     * @param content the HTML text to search
     * @param name the value of the meta tag's {@code name} attribute
     * @return the attribute value, or {@code null} when no such tag is found
     */
    private String meta(String content, String name) {
        // NOTE(review): the pattern literal was lost from this file and has been
        // reconstructed; confirm the exact expression against the upstream
        // project sources.
        Matcher matcher = compile(String.format("<meta\\s+name=\"%s\"\\s+content=\"(.*?)\"\\s*/?>", name)).matcher(content);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Replaces the configured tag specifications.
     *
     * @param paramTags new tag specifications, outermost nesting level first
     * @return this builder
     * @throws IllegalArgumentException if no tag is given
     */
    public HtmlDocumentBuilder useTags(String... paramTags) {
        // Validate before clearing so a rejected call leaves the builder intact.
        if (paramTags.length == 0) {
            throw new IllegalArgumentException("Specify at least a tag");
        }
        this.tags.clear();
        this.tags.addAll(Arrays.asList(paramTags));
        return this;
    }

    /**
     * Registers all given content filters after the ones already registered.
     *
     * @param listFilters the filters to add
     * @return this builder
     */
    public HtmlDocumentBuilder addAllFilters(List<HtmlContentFilter> listFilters) {
        this.filters.addAll(listFilters);
        return this;
    }

    /**
     * Registers the given content filters after the ones already registered.
     *
     * @param filter the filters to add
     * @return this builder
     */
    public HtmlDocumentBuilder addFilter(HtmlContentFilter... filter) {
        return addAllFilters(Arrays.asList(filter));
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy