com.cloudburo.grab.webcontent.Grabber Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of webcontent-grabber Show documentation
Show all versions of webcontent-grabber Show documentation
A Java client library for grabbing web content
The newest version!
package com.cloudburo.grab.webcontent;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import org.xml.sax.SAXException;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.CommonExtractors;
import de.l3s.boilerpipe.sax.HTMLHighlighter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Fetches a web page and extracts its content with boilerpipe's {@link HTMLHighlighter}.
 *
 * <p>Each {@code extract*} method accepts a URL string. If the string contains
 * {@code www.google.com/url} it is treated as a Google redirect link and the value of its
 * {@code url} query parameter is fetched instead of the redirect link itself.
 */
public class Grabber {

    // Operation mode of the highlighter: the extracting instance is used here.
    // The alternative is HTMLHighlighter.newHighlightingInstance().
    final HTMLHighlighter hh = HTMLHighlighter.newExtractingInstance();

    static final Logger logger = LoggerFactory.getLogger(Grabber.class);

    /** Substring that marks an input URL as a Google redirect link. */
    private static final String GOOGLE_REDIRECT_MARKER = "www.google.com/url";

    /** Name of the query parameter that carries the redirect target. */
    private static final String REDIRECT_TARGET_PARAM = "url";

    public Grabber() {}

    /**
     * Extracts the page content using {@code CommonExtractors.ARTICLE_EXTRACTOR}.
     *
     * <p>Network problems surface as {@link IOException}; a
     * {@code java.net.ConnectException: Operation timed out} has been observed here.
     *
     * @param inurl    URL of the page, or a Google redirect link pointing to it
     * @param textOnly if {@code true}, the extracted HTML is parsed with Jsoup and replaced by
     *                 the plain text of its BODY element
     * @return a record holding the resolved URL and the extracted content
     * @throws IOException                   if the URL is malformed or the page cannot be read
     * @throws BoilerpipeProcessingException if boilerpipe fails to process the page
     * @throws SAXException                  if the HTML cannot be parsed
     */
    public GrabberRecord extractArticle(String inurl, boolean textOnly)
            throws IOException, BoilerpipeProcessingException, SAXException {
        URL url = new URL(resolveGoogleRedirect(inurl));
        String content = hh.process(url, CommonExtractors.ARTICLE_EXTRACTOR);
        if (textOnly) {
            Document doc = Jsoup.parse(content);
            content = doc.select("BODY").first().text();
        }
        GrabberRecord rec = new GrabberRecord();
        rec.url = url;
        rec.content = content;
        return rec;
    }

    /**
     * Extracts the page content using {@code CommonExtractors.DEFAULT_EXTRACTOR}.
     *
     * @param inurl URL of the page, or a Google redirect link pointing to it
     * @return the output of the highlighter for that page
     * @throws IOException                   if the URL is malformed or the page cannot be read
     * @throws BoilerpipeProcessingException if boilerpipe fails to process the page
     * @throws SAXException                  if the HTML cannot be parsed
     */
    public String extractDefault(String inurl)
            throws IOException, BoilerpipeProcessingException, SAXException {
        URL url = new URL(resolveGoogleRedirect(inurl));
        return hh.process(url, CommonExtractors.DEFAULT_EXTRACTOR);
    }

    /**
     * Extracts the page content using {@code CommonExtractors.CANOLA_EXTRACTOR}.
     *
     * <p>The method name is a misspelling of "Canola"; it is kept as is because it is part of
     * the public interface.
     *
     * @param inurl URL of the page, or a Google redirect link pointing to it
     * @return the output of the highlighter for that page
     * @throws IOException                   if the URL is malformed or the page cannot be read
     * @throws BoilerpipeProcessingException if boilerpipe fails to process the page
     * @throws SAXException                  if the HTML cannot be parsed
     */
    public String extractCanloa(String inurl)
            throws IOException, BoilerpipeProcessingException, SAXException {
        URL url = new URL(resolveGoogleRedirect(inurl));
        return hh.process(url, CommonExtractors.CANOLA_EXTRACTOR);
    }

    /**
     * Extracts the page content using {@code CommonExtractors.LARGEST_CONTENT_EXTRACTOR}.
     *
     * @param inurl URL of the page, or a Google redirect link pointing to it
     * @return the output of the highlighter for that page
     * @throws IOException                   if the URL is malformed or the page cannot be read
     * @throws BoilerpipeProcessingException if boilerpipe fails to process the page
     * @throws SAXException                  if the HTML cannot be parsed
     */
    public String extractLargestContent(String inurl)
            throws IOException, BoilerpipeProcessingException, SAXException {
        URL url = new URL(resolveGoogleRedirect(inurl));
        return hh.process(url, CommonExtractors.LARGEST_CONTENT_EXTRACTOR);
    }

    /**
     * Returns the URL that should actually be fetched.
     *
     * <p>Example of a redirect link:
     * {@code https://www.google.com/url?rct=j&sa=t&url=http://www.weser-kurier.de/region/osterholz_artikel,-Auf-den-Hund-gekommen-_arid,957450.html&ct=ga&cd=CAIyGmFhMDMyNWRiMmZmMWQxNzc6Y29tOmRlOlVT&usg=AFQjCNERP6ZxevD2QPzgT9hVu_sFi97WEA}
     *
     * @param url the URL as given by the caller
     * @return the {@code url} query parameter if {@code url} is a Google redirect link,
     *         otherwise {@code url} unchanged
     * @throws MalformedURLException if a redirect link cannot be parsed or carries no
     *                               {@code url} parameter
     */
    private String resolveGoogleRedirect(String url) throws MalformedURLException {
        String target = url;
        if (url.contains(GOOGLE_REDIRECT_MARKER)) {
            target = getQueryMap(new URL(url)).get(REDIRECT_TARGET_PARAM);
            // Fail with a descriptive message instead of handing null to new URL(...).
            if (target == null || target.isEmpty()) {
                throw new MalformedURLException(
                        "Google redirect URL has no '" + REDIRECT_TARGET_PARAM
                                + "' parameter: " + url);
            }
        }
        logger.debug("Fetching with URL: {}", target);
        return target;
    }

    /**
     * Splits the query string of {@code url} into name/value pairs.
     *
     * <p>Only the first {@code '='} of a parameter separates name from value, so values that
     * contain {@code '='} themselves are kept intact. A parameter without {@code '='} maps to
     * the empty string. If a name occurs more than once the last occurrence wins.
     *
     * <p>NOTE(review): values are returned as they appear in the query and are not
     * percent-decoded; confirm whether redirect targets can arrive encoded.
     *
     * @param url the URL whose query is parsed
     * @return a mutable map of the parameters; empty if the URL has no query
     */
    private static Map<String, String> getQueryMap(URL url) {
        Map<String, String> map = new HashMap<String, String>();
        String query = url.getQuery();
        if (query == null) {
            return map;
        }
        for (String param : query.split("&")) {
            if (param.isEmpty()) {
                // Produced by an empty query or by consecutive '&' separators.
                continue;
            }
            int separator = param.indexOf('=');
            String name = separator < 0 ? param : param.substring(0, separator);
            String value = separator < 0 ? "" : param.substring(separator + 1);
            map.put(name, value);
        }
        return map;
    }
}