All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.openimaj.web.layout.LayoutExtractor Maven / Gradle / Ivy

/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * 	Redistributions of source code must retain the above copyright notice,
 * 	this list of conditions and the following disclaimer.
 *
 *   *	Redistributions in binary form must reproduce the above copyright notice,
 * 	this list of conditions and the following disclaimer in the documentation
 * 	and/or other materials provided with the distribution.
 *
 *   *	Neither the name of the University of Southampton nor the names of its
 * 	contributors may be used to endorse or promote products derived from this
 * 	software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.web.layout;

import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeoutException;

import org.apache.log4j.Logger;
import org.openimaj.image.MBFImage;
import org.openimaj.image.colour.ColourSpace;
import org.openimaj.image.renderer.MBFImageRenderer;
import org.openimaj.math.geometry.shape.Rectangle;
import org.openimaj.web.ProgrammaticBrowser;
import org.openimaj.web.readability.Readability;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import com.trolltech.qt.webkit.QWebElement;
import com.trolltech.qt.webkit.QWebElementCollection;

/**
 * Class for extracting information on the layout of DOM elements in
 * a web page.
 * 
 * @author Jonathon Hare ([email protected])
 *
 */
public class LayoutExtractor {
	private static final String GEN_ID = "__openimaj_gen_id_";

	private static final Logger logger = Logger.getLogger(LayoutExtractor.class);
	
	private ProgrammaticBrowser browser;
	
	private long timeout = 0;

	/**
	 * Default constructor
	 */
	public LayoutExtractor() {
		// every extractor owns its own browser instance
		this.browser = new ProgrammaticBrowser();
	}
	
	/**
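	 * Construct an extractor that uses the given timeout when loading pages from a URL.
	 * @param timeout the timeout value handed to the browser on each URL load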
	 */
	public LayoutExtractor(long timeout) {
		// delegate to the no-argument constructor, which creates the browser
		this();
		// stored for later use by the URL-based load methods
		this.timeout = timeout;
	}

	/**
	 * Load a web page from a URL
	 * @param url the url
	 * @return true if successful; false otherwise
	 */
	public boolean load(String url) {
		final boolean loaded;
		try {
			loaded = browser.load(url, timeout);
		} catch (TimeoutException e) {
			// a timeout is reported to the caller as an unsuccessful load
			return false;
		}

		if (!loaded) {
			return false;
		}

		// only a successfully loaded page gets its DOM augmented
		augmentDOM();
		return true;
	}
	
	/**
	 * Load a web page from a URL
	 * @param url the url
	 * @return true if successful; false otherwise
	 */
	public boolean load(URL url) {
		final boolean loaded;
		try {
			loaded = browser.load(url, timeout);
		} catch (TimeoutException e) {
			// a timeout is reported to the caller as an unsuccessful load
			return false;
		}

		if (!loaded) {
			return false;
		}

		// only a successfully loaded page gets its DOM augmented
		augmentDOM();
		return true;
	}
	
	/**
	 * Load a web page from an HTML string
	 * @param html the HTML string
	 * @return true if successful; false otherwise
	 */
	public boolean loadHTML(String html) {
		if (!browser.loadHTML(html)) {
			// nothing was loaded, so there is no DOM to augment
			return false;
		}

		augmentDOM();
		return true;
	}
	
	private void augmentDOM() {
		QWebElement body = getBody();
		
		if (body == null) {
			logger.warn("body not found");
			return;
		}

		QWebElementCollection nl = body.findAll("*");
		for (int i=0; i getLayoutInfo() {
		List info = new ArrayList();
		
		Set contentIds = getContentIds();
		
		QWebElementCollection elements = browser.findAllElements("*");
		
		for (int i=0; i getContentIds() {
		Set ids = new HashSet();
		try {
			String html = browser.getHTML();
			
			Readability r = Readability.getReadability(html);

			Element d = (Element) r.getArticleHTML_DOM();
			if (d==null) return ids;
			NodeList nl = d.getElementsByTagName("*");

			for (int i=0; i content_areas = new ArrayList();
		List non_content_areas = new ArrayList();
		List non_content_areas_inside = new ArrayList();
		
		for (ElementInfo ei : getLayoutInfo()) {
			if (ei.isContent) {
				content_areas.add(ei.bounds);
			} else if (ei.isInsideContent) {
				non_content_areas_inside.add(ei.bounds);
			} else {
				non_content_areas.add(ei.bounds);
			}
		}

		MBFImageRenderer renderer = image.createRenderer();
		for (Rectangle r : content_areas) {
			renderer.drawShape(r, contentColour);
		}

		for (Rectangle r : non_content_areas_inside) {
			renderer.drawShape(r, nonContentInside);
		}
		
		for (Rectangle r : non_content_areas) {
			renderer.drawShape(r, nonContent);
		}

		return image;
	}
	
	/**
	 * Render the current page to an image
	 * @return an image of the current page, or null if there is no content
	 */
	public MBFImage render() {
		// rendering is delegated entirely to the browser
		final MBFImage snapshot = browser.renderToImage();
		return snapshot;
	}
	
	/**
	 * Render the current page to an image of the given size or smaller
	 * @param maxwidth the maximum width of the rendered image
	 * @param maxheight the maximum height of the rendered image
	 * @return an image of the current page, or null if there is no content
	 */
	public MBFImage render(int maxwidth, int maxheight) {
		// the size limits are passed straight through to the browser
		final MBFImage snapshot = browser.renderToImage(maxwidth, maxheight);
		return snapshot;
	}
	
	/**
	 * Run the browser for ms milliseconds. This
	 * allows it to update its content, etc.
	 * @param ms time to wait
	 */
	public void waitForBrowser(long ms) {
		// hand control to the browser's main loop, passing the requested wait time through
		this.browser.mainLoop(ms);
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy