All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.greenpepper.html.HtmlDocumentBuilder Maven / Gradle / Ivy

/*
 * Copyright (c) 2006 Pyxis Technologies inc.
 *
 * This is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA,
 * or see the FSF site: http://www.fsf.org.
 */

package com.greenpepper.html;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.greenpepper.shaded.org.slf4j.Logger;

import com.greenpepper.Example;
import com.greenpepper.TextExample;
import com.greenpepper.dialect.SpecificationDialect;
import com.greenpepper.document.Document;
import com.greenpepper.repository.DocumentBuilder;
import com.greenpepper.util.CollectionUtil;
import com.greenpepper.util.IOUtil;
import com.greenpepper.util.log.GreenPepperLogger;

/**
 * Builds a {@link com.greenpepper.document.Document} from HTML text by matching a
 * hierarchy of tag patterns with regular expressions.
 *
 * <p>The builder is configured with a list of tag specifications, one per nesting
 * level: the first entry is matched at the current level and the remaining entries
 * are handed to a child builder that parses the content of each matched element.
 * Within one specification, alternatives are separated by spaces (they are joined
 * with {@code |} before being used in a regular expression), and an entry may be
 * written as {@code parent>child}; the {@code parent>} prefix is removed when the
 * matched element is that parent.
 *
 * <p>NOTE(review): this copy of the source looks as if it went through an HTML
 * stripper. Generic type arguments appear to be missing (raw {@code List} is used
 * although elements are read as {@code String} / {@code HtmlContentFilter}), and
 * several string literals that presumably contained markup are empty or truncated.
 * The affected spots are flagged individually below; verify against the upstream
 * GreenPepper sources before relying on this file.
 *
 * @author oaouattara
 * @version $Id: $Id
 */
public class HtmlDocumentBuilder implements DocumentBuilder {

    private static final Logger logger = GreenPepperLogger.getLogger(HtmlDocumentBuilder.class);

    /** Tag specifications, one entry per nesting level; index 0 is the level matched by this builder. */
    private final List tags;

    /** Content filters; {@code content(Matcher)} consults them from the last added to the first. */
    private final List filters = new ArrayList();

    /** Optional dialect; when non-null, {@code build} converts the raw content with it before parsing. */
    private SpecificationDialect dialect;

    /**
     * Creates a builder configured with the levels {@code "table"}, {@code "tr"}
     * and {@code "td th"}.
     *
     * @return a {@link com.greenpepper.html.HtmlDocumentBuilder} object.
     */
    public static HtmlDocumentBuilder tables() {
        return new HtmlDocumentBuilder( "table", "tr", "td th" );
    }

    /**
     * Creates a builder that recognises tables as well as ordered and unordered
     * lists, and registers a {@code BulletListFilter} on it.
     *
     * @return a {@link com.greenpepper.html.HtmlDocumentBuilder} object.
     */
    public static HtmlDocumentBuilder tablesAndLists() {
        return new HtmlDocumentBuilder( "table ul ol", "table>tr ol>li ul>li", "tr>td tr>th li>span li>b li>i li>u li>em" ).addFilter( new BulletListFilter() );
    }

    /**
     * Creates a builder from the given tag specifications, outermost level first.
     * Delegates to the list-based constructor.
     *
     * @param tags a {@link java.lang.String} object.
     */
    public HtmlDocumentBuilder( String... tags ) {
        this( Arrays.asList( tags ) );
    }

    /**
     * Creates a builder from the given tag specifications, outermost level first.
     * The list is copied, so later changes to the argument do not affect the builder.
     *
     * @param tags a {@link java.util.List} object.
     * @throws IllegalArgumentException if {@code tags} is empty.
     */
    public HtmlDocumentBuilder( List tags ) {
        if (tags.isEmpty()) throw new IllegalArgumentException( "Specify at least a tag" );
        this.tags = new ArrayList( tags );
    }

    /** {@inheritDoc} */
    public Document build( Reader reader ) throws IOException {
        String content = IOUtil.readContent(reader);
        String html = content;
        // The dialect is optional; without one the content is used unchanged.
        if (dialect != null) {
            html = dialect.convert(content);
        }
        html = injectCss(html);
        Example example = parse( html );
        // When no element of the configured first level is found, wrap the whole text instead.
        if (example == null) example = new TextExample( html );
        return Document.html( example, name( html ), externalLink( html ) );
    }

    /**
     * Parses the given HTML into an example tree, after passing it through
     * {@code removeComments}.
     *
     * @param html a {@link java.lang.String} object.
     * @return a {@link com.greenpepper.Example} object, or {@code null} when the
     *         text contains no complete element for the first tag level.
     */
    public Example parse( String html ) {
        String text = removeComments( html );
        return doParse(text);
    }

    /**
     * Stores the dialect that {@code build} applies to the raw content.
     *
     * @param dialect the dialect to use; {@code null} disables conversion.
     * @return this builder.
     */
    @Override
    public DocumentBuilder withDialect(SpecificationDialect dialect) {
        this.dialect = dialect;
        return this;
    }

    /*
     * Intended, judging by the name and the messages, to insert the report CSS
     * into the document right after a tag located by the pattern below.
     *
     * NOTE(review): this method is damaged in this copy and does not compile:
     *  - the "catch (IOException e)" clause has no matching "try"; the opening
     *    "try {" and whatever statement could throw IOException (presumably the
     *    code that loads the stylesheet) are missing;
     *  - the second group of the pattern reads "]*>", which looks like the
     *    remainder of a tag pattern whose opening part was stripped;
     *  - cssStyle.append("") appends nothing, so the literal that carried the
     *    style markup appears to be lost as well.
     * Restore these from the upstream source; they cannot be recovered from here.
     */
    private String injectCss(String html) {
        Pattern pattern = Pattern.compile("(.*)(]*>)(.*)",Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
        Matcher bodyMatcher = pattern.matcher(html);
        if (bodyMatcher.matches()) {
            // group(2) is the located tag; the injected text is appended directly after it.
            StringBuilder cssStyle = new StringBuilder(bodyMatcher.group(2));
            cssStyle.append("");
            String modified = bodyMatcher.group(1) + cssStyle + bodyMatcher.group(3);
            logger.trace("Html with Injected CSS\n {}", modified);
            return modified;
            } catch (IOException e) {
                throw new RuntimeException("Couldn't inject the report css in the document ",e);
            }
        } else {
            // No insertion point found: return the document unchanged.
            return html;
        }
    }

    /*
     * Recursive worker behind parse(String). Matches the first element of the
     * current level, parses its content with a child builder (next level) and
     * the text after it as a sibling at the same level, then assembles an
     * HtmlExample from the captured pieces. Returns null when the guard rejects
     * the text or the element pattern does not match.
     */
    private Example doParse(String text) {
        // Cheap pre-check so the expensive element pattern is never run on hopeless input.
        if (pathologicalCase(text)) return null;

        Matcher match = match( text );
        if (!match.find()) return null;

        Example child = hasChild( match ) ? childParser( match ).doParse( content( match ) ) : null;
        Example sibling = hasSibling( match ) ? doParse( tail( match ) ) : null;
        return new HtmlExample( lead( match ), startTag( match ), tag( match ), content( match ), endTag( match ), tail( match ), childTags( match ), child, sibling );
    }

    /**
     * Guard that tells whether the text cannot contain a complete element of the
     * first tag level: it returns {@code true} when no opening tag is found, or
     * when an opening tag is found but no closing tag follows it.
     *
     * <p>Rationale, quoted from the original comment:
     * Most regex implementations today do not build a DFA / NFA --
     * especially those that offer backreferences (which are not "regular" at all).
     *
     * And because they do NOT build DFAs and NFAs, it's very simple to construct pathological cases
     * - e.g., "((a*)(a*))+b" can take exponentially long to decide that
     * aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaac is not in the language
     * using the matching techniques commonly in use.
     *
     * Ori Berger
     */
    private boolean pathologicalCase(String text) {
        String tag = toRegex( CollectionUtil.first( tags ));
        // First look for an opening tag of any alternative of the first level.
        String regex = String.format( "(?is)(<\\s*(%s)\\s*.*?>)", tag );
        Matcher match = compile(regex).matcher(text);
        if (match.find()) {
            // Then require a closing tag somewhere after the end of that opening tag.
            regex = String.format( "(?is)(.*?)(<\\s*/\\s*(%s)\\s*.*?>)", tag );
            return !compile(regex).matcher(text).find(match.end());
        }
        return true;
    }

    /* Creates a matcher for the first-level element pattern over the given text. */
    private Matcher match( String text ) {
        return compile( firstTag( tags ) ).matcher( text );
    }

    /* True when the text after the matched element, taken as a whole, matches the same-level element pattern. */
    private boolean hasSibling( Matcher match ) {
        return tail( match ).matches( firstTag( tags ) );
    }

    /* Builds the parser for the next nesting level, carrying over this builder's filters. */
    private HtmlDocumentBuilder childParser( Matcher match ) {
        return new HtmlDocumentBuilder( childTags( match ) ).addAllFilters( filters );
    }

    /* True when deeper levels are configured and the element content matches the next level's pattern. */
    private boolean hasChild( Matcher match ) {
        return !childTags( match ).isEmpty() && content( match ).matches( firstTag( childTags( match ) ) );
    }

    /*
     * Returns the tag specifications for the levels below the current one: every
     * entry after index 0, with the "matchedTag>" selector prefix removed.
     * NOTE(review): raw List; the elements are used as String.
     */
    private List childTags( Matcher matcher ) {
        List moreTags = new ArrayList();
        for (int i = 1; i < tags.size(); i++) {
            moreTags.add( stripSelector( tag( matcher ), tags.get( i ) ) );
        }
        return moreTags;
    }

    /* Returns the element pattern for the first entry of the given specifications. */
    private String firstTag( List tags ) {
        return elementPattern( toRegex( CollectionUtil.first( tags ) ) );
    }

    /* Group 6 of the element pattern: everything after the closing tag. */
    private String tail( Matcher matcher ) {
        return matcher.group( 6 );
    }

    /* Group 5 of the element pattern: the closing tag. */
    private String endTag( Matcher matcher ) {
        return matcher.group( 5 );
    }

    /*
     * Group 4 of the element pattern (the text between the opening and closing
     * tag), processed by the most recently added filter that handles the matched
     * tag, or returned as-is when no filter handles it.
     */
    private String content( Matcher matcher ) {
        String content = matcher.group( 4 );
        // Walk backwards so that filters added later take precedence.
        for (int i = filters.size() - 1; i >= 0; i--) {
            HtmlContentFilter filter = filters.get( i );
            if (filter.handles( tag( matcher ) )) return filter.process( content );
        }
        return content;
    }

    /* Group 3 of the element pattern: the tag name, lower-cased. */
    private String tag( Matcher matcher ) {
        return matcher.group( 3 ).toLowerCase();
    }

    /* Group 2 of the element pattern: the complete opening tag. */
    private String startTag( Matcher matcher ) {
        return matcher.group( 2 );
    }

    /* Group 1 of the element pattern: the text before the opening tag. */
    private String lead( Matcher matcher ) {
        return matcher.group( 1 );
    }

    /* Thin wrapper around Pattern.compile. */
    private Pattern compile( String regex ) {
        return Pattern.compile( regex );
    }

    /*
     * Meant to remove comments from the HTML before parsing.
     * NOTE(review): the pattern consists of the flags "(?is)" only, so it matches
     * the empty string and the call returns the input unchanged. The comment
     * expression itself appears to have been stripped from this copy; restore it
     * from the upstream source.
     */
    private String removeComments( String html ) {
        return html.replaceAll( "(?is)", "" );
    }

    /*
     * Removes every occurrence of "selector>" from the tag specification, where
     * selector is the tag name that was matched one level up. The selector is
     * interpreted as a regular expression by replaceAll.
     */
    private String stripSelector( String selector, String tag ) {
        return tag.replaceAll( selector + ">", "" );
    }

    /*
     * Builds the element pattern for the given alternation of tag names. Groups:
     * 1 = leading text, 2 = opening tag, 3 = tag name, 4 = content,
     * 5 = closing tag (must repeat the name captured in group 3), 6 = trailing text.
     * Matching is case-insensitive and '.' also matches line terminators.
     */
    private String elementPattern( String t ) {
        return String.format( "(?is)(.*?)(<\\s*(%s)\\s*.*?>)(.*?)(<\\s*/\\s*\\3\\s*>)(.*)", t );
    }

    /* Turns a space-separated list of tag names into a regex alternation. */
    private String toRegex( String tags ) {
        return tags.replaceAll( " ", "|" );
    }

    /* Document name: the value returned by meta for the key "title". */
    private String name( String content ) {
        return meta( content, "title" );
    }

    /* External link: the value returned by meta for the key "external-link". */
    private String externalLink( String content ) {
        return meta( content, "external-link" );
    }

    /*
     * Meant to extract a named value from the document, returning null when it
     * is absent.
     * NOTE(review): the format string is empty in this copy, so "name" is ignored,
     * the compiled pattern has no capturing group, find() succeeds on any input
     * and group( 1 ) then throws IndexOutOfBoundsException. The original
     * expression, presumably a meta-tag pattern with one capturing group for the
     * value, appears to have been stripped; restore it from the upstream source.
     */
    private String meta( String content, String name ) {
        Matcher matcher = compile(String.format( "" , name )).matcher( content );
        return matcher.find() ? matcher.group( 1 ) : null;
    }

    /**
     * Replaces the configured tag specifications with the given ones, outermost
     * level first. Unlike the constructors, this method does not reject an empty
     * argument list.
     *
     * @param tags a {@link java.lang.String} object.
     * @return a {@link com.greenpepper.html.HtmlDocumentBuilder} object.
     */
    public HtmlDocumentBuilder useTags( String... tags ) {
        this.tags.clear();
        this.tags.addAll( Arrays.asList( tags ) );
        return this;
    }

    /**
     * Appends all given filters to this builder's filter list.
     *
     * @param filters a {@link java.util.List} object.
     * @return a {@link com.greenpepper.html.HtmlDocumentBuilder} object.
     */
    public HtmlDocumentBuilder addAllFilters( List filters ) {
        this.filters.addAll( filters );
        return this;
    }

    /**
     * Appends the given filters to this builder's filter list.
     *
     * @param filter a {@link com.greenpepper.html.HtmlContentFilter} object.
     * @return a {@link com.greenpepper.html.HtmlDocumentBuilder} object.
     */
    public HtmlDocumentBuilder addFilter( HtmlContentFilter... filter ) {
        return addAllFilters( Arrays.asList( filter ) );
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy