All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.cloudburo.grab.webcontent.Grabber Maven / Gradle / Ivy

The newest version!
package com.cloudburo.grab.webcontent;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

import org.xml.sax.SAXException;

import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.CommonExtractors;
import de.l3s.boilerpipe.sax.HTMLHighlighter;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Extracts the main content of a web page using the boilerpipe extractors.
 *
 * <p>Each {@code extract*} method takes a URL string, unwraps it when it is a
 * Google redirect link ({@code www.google.com/url?...&url=<target>&...}) and
 * hands the resulting URL to the shared {@link HTMLHighlighter} together with
 * one of the {@link CommonExtractors} strategies.
 */
public class Grabber {
	
	// Operation mode of the highlighter: this instance extracts the content;
	// HTMLHighlighter.newHighlightingInstance() would highlight it instead.
	final HTMLHighlighter hh = HTMLHighlighter.newExtractingInstance();
	
	static final Logger logger = LoggerFactory.getLogger(Grabber.class);	

	public Grabber() {}
	
	/**
	 * Extracts the article content of the page behind {@code inurl}.
	 *
	 * @param inurl    page URL, optionally wrapped in a Google redirect link
	 * @param textOnly when {@code true} the extractor output is parsed with jsoup
	 *                 and only the text of its body element is kept; otherwise the
	 *                 extractor output is returned unchanged
	 * @return a record holding the URL that was actually processed and the content
	 * @throws IOException if the URL is malformed or processing fails with an I/O
	 *         error (a {@code java.net.ConnectException: Operation timed out} has
	 *         been observed here)
	 * @throws BoilerpipeProcessingException if the extractor fails
	 * @throws SAXException if the page cannot be parsed
	 */
	public GrabberRecord extractArticle(String inurl, boolean textOnly) throws IOException, BoilerpipeProcessingException, SAXException  {
		URL url = new URL(resolveGoogleRedirect(inurl));
		String content = hh.process(url, CommonExtractors.ARTICLE_EXTRACTOR);
		if (textOnly) {
			Document doc = Jsoup.parse(content);
			content = doc.select("BODY").first().text();
		}
		GrabberRecord rec = new GrabberRecord();
		rec.url = url;
		rec.content = content;
		return rec;
	}
	
	/**
	 * Processes the page behind {@code inurl} with {@link CommonExtractors#DEFAULT_EXTRACTOR}.
	 *
	 * @param inurl page URL, optionally wrapped in a Google redirect link
	 * @return the output of the highlighter for that page
	 * @throws IOException if the URL is malformed or processing fails with an I/O error
	 * @throws BoilerpipeProcessingException if the extractor fails
	 * @throws SAXException if the page cannot be parsed
	 */
	public String extractDefault(String inurl) throws IOException, BoilerpipeProcessingException, SAXException  {
		URL url = new URL(resolveGoogleRedirect(inurl));
		return hh.process(url, CommonExtractors.DEFAULT_EXTRACTOR);
	}
	
	/**
	 * Processes the page behind {@code inurl} with {@link CommonExtractors#CANOLA_EXTRACTOR}.
	 *
	 * @param inurl page URL, optionally wrapped in a Google redirect link
	 * @return the output of the highlighter for that page
	 * @throws IOException if the URL is malformed or processing fails with an I/O error
	 * @throws BoilerpipeProcessingException if the extractor fails
	 * @throws SAXException if the page cannot be parsed
	 */
	public String extractCanola(String inurl) throws IOException, BoilerpipeProcessingException, SAXException  {
		URL url = new URL(resolveGoogleRedirect(inurl));
		return hh.process(url, CommonExtractors.CANOLA_EXTRACTOR);
	}
	
	/**
	 * Misspelled alias of {@link #extractCanola(String)}, kept so existing callers
	 * continue to compile.
	 *
	 * @param inurl page URL, optionally wrapped in a Google redirect link
	 * @return the output of the highlighter for that page
	 * @throws IOException if the URL is malformed or processing fails with an I/O error
	 * @throws BoilerpipeProcessingException if the extractor fails
	 * @throws SAXException if the page cannot be parsed
	 * @deprecated use {@link #extractCanola(String)}
	 */
	@Deprecated
	public String extractCanloa(String inurl) throws IOException, BoilerpipeProcessingException, SAXException  {
		return extractCanola(inurl);
	}
	
	/**
	 * Processes the page behind {@code inurl} with
	 * {@link CommonExtractors#LARGEST_CONTENT_EXTRACTOR}.
	 *
	 * @param inurl page URL, optionally wrapped in a Google redirect link
	 * @return the output of the highlighter for that page
	 * @throws IOException if the URL is malformed or processing fails with an I/O error
	 * @throws BoilerpipeProcessingException if the extractor fails
	 * @throws SAXException if the page cannot be parsed
	 */
	public String extractLargestContent(String inurl) throws IOException, BoilerpipeProcessingException, SAXException  {
		URL url = new URL(resolveGoogleRedirect(inurl));
		return hh.process(url, CommonExtractors.LARGEST_CONTENT_EXTRACTOR);
	}
	
	/**
	 * Unwraps a Google redirect link and returns the URL that should be fetched.
	 *
	 * <p>Example input:
	 * {@code https://www.google.com/url?rct=j&sa=t&url=http://www.weser-kurier.de/region/osterholz_artikel,-Auf-den-Hund-gekommen-_arid,957450.html&ct=ga&cd=CAIyGmFhMDMyNWRiMmZmMWQxNzc6Y29tOmRlOlVT&usg=AFQjCNERP6ZxevD2QPzgT9hVu_sFi97WEA}
	 *
	 * @param url the URL handed in by the caller
	 * @return the value of the {@code url} query parameter when {@code url} is a
	 *         Google redirect link carrying one, otherwise {@code url} itself
	 * @throws MalformedURLException if {@code url} is {@code null}, or looks like a
	 *         redirect link but cannot be parsed as a URL
	 */
	private String resolveGoogleRedirect(String url) throws MalformedURLException {
		if (url == null) {
			throw new MalformedURLException("URL must not be null");
		}
		if (url.contains("www.google.com/url")) {
			String target = getQueryMap(new URL(url)).get("url");
			// The check above is only a substring match, so the parameter may be
			// absent or empty; in that case keep the URL as it was handed in.
			if (target != null && !target.isEmpty()) {
				url = target;
			}
		}
		logger.debug("Fetching with URL: {}", url);
		return url;
	}
	
	/**
	 * Splits the query string of {@code url} into a name/value map.
	 *
	 * <p>Each parameter is split at its first {@code '='} only, so a value that
	 * itself contains {@code '='} is kept intact. A parameter without {@code '='}
	 * maps to the empty string. Names and values are stored exactly as they appear
	 * in the query string; no percent-decoding is applied. When a name occurs more
	 * than once the last occurrence wins.
	 *
	 * @param url the URL whose query string is parsed
	 * @return a mutable map of the parameters; empty when the URL has no query
	 */
	private Map<String, String> getQueryMap(URL url) {
		Map<String, String> map = new HashMap<String, String>();
		String query = url.getQuery();
		if (query == null || query.isEmpty()) {
			return map;
		}
		for (String param : query.split("&")) {
			if (param.isEmpty()) {
				// Produced by "&&" or a leading "&"; nothing to record.
				continue;
			}
			int eq = param.indexOf('=');
			if (eq < 0) {
				map.put(param, "");
			} else {
				map.put(param.substring(0, eq), param.substring(eq + 1));
			}
		}
		return map;
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy