org.tinymediamanager.scraper.SearchTitleWithGoogle Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of api-scraper Show documentation
Show all versions of api-scraper Show documentation
API for tinyMediaManager scrapers
package org.tinymediamanager.scraper;
import java.io.InputStream;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tinymediamanager.scraper.http.InMemoryCachedUrl;
import org.tinymediamanager.scraper.http.Url;
public class SearchTitleWithGoogle {
  private static final Logger LOGGER        = LoggerFactory.getLogger(SearchTitleWithGoogle.class);
  private static final String PAGE_ENCODING = "UTF-8";

  /**
   * Does a fallback search with Google, restricted to the given site via a {@code site:} query, and returns whatever results are found on the
   * first result page.
   * <p>
   * The caller has to
   * <ol>
   * <li>check whether the result url starts with the desired destination page (aka filter the results)</li>
   * <li>extract the ID from the url (if the scraper works with IDs)</li>
   * </ol>
   * Results are located via the CSS class {@code "r"} of the result page; elements without a link, or with a link which cannot be decoded, are
   * skipped.
   *
   * @param site
   *          the base hostname like "zelluloid.de"; a full url is accepted too, only its host part is used
   * @param mpi
   *          the provider info whose id is passed into every created search result
   * @param options
   *          the search options; query, language and media type are read from it
   * @return a list of MediaSearchResult, but NO id filled. Scraper MUST work with url-only! The list is empty if the query is empty or the search
   *         failed - this method never returns {@code null} and never throws
   */
  public List search(String site, MediaProviderInfo mpi, MediaSearchOptions options) {
    LOGGER.debug("SearchTitleWithGoogle() - {}", options);
    List<MediaSearchResult> resultList = new ArrayList<>();

    String host = "";
    String searchUrl = "";
    String searchTerm = "";
    try {
      if (StringUtils.isEmpty(options.getQuery())) {
        LOGGER.debug("empty searchString");
        return resultList;
      }

      // java.net.URL needs a protocol to be able to extract the host part
      String siteUrl = site.startsWith("http") ? site : "http://" + site;
      host = new URL(siteUrl).getHost();
      searchTerm = options.getQuery();

      // NOTE(review): the language code is used as the Google TLD; this only resolves where both coincide (e.g. "de") - confirm intended
      String lang = options.getLanguage().getLanguage();
      searchUrl = "https://www.google." + lang + "/search?q=" + URLEncoder.encode("site:" + host + " " + searchTerm, PAGE_ENCODING);
      LOGGER.debug("search for: {} ({})", searchTerm, searchUrl);
    }
    catch (Exception e) {
      // trailing throwable argument: SLF4J logs the stack trace in addition to the formatted message
      LOGGER.warn("error searching {}", e.getMessage(), e);
      return resultList;
    }

    try {
      Url url = new InMemoryCachedUrl(searchUrl);
      Document doc;
      // try-with-resources: the stream gets closed even if parsing fails
      try (InputStream in = url.getInputStream()) {
        doc = Jsoup.parse(in, PAGE_ENCODING, "");
      }

      Elements res = doc.getElementsByClass("r");
      for (Element el : res) {
        Element a = el.getElementsByTag("a").first();
        if (a == null) {
          // result container without a link - nothing we could report
          continue;
        }

        try {
          String gurl = extractTargetUrl(a.attr("href"));

          MediaSearchResult sr = new MediaSearchResult(mpi.getId(), options.getMediaType());
          sr.setUrl(URLDecoder.decode(gurl, PAGE_ENCODING));
          // sr.setId(mpi.getId()); // we have no clue about ID!!
          // literal replace - the host contains dots, which must not be interpreted as regex wildcards
          sr.setTitle(a.text().replace(host, "(via Google)"));
          resultList.add(sr);
        }
        catch (Exception e) {
          // a single broken result must not discard all the other ones
          LOGGER.debug("skipping unparseable search result - {}", e.getMessage());
        }
      }
    }
    catch (Exception e) {
      LOGGER.error("failed to search for {} - {}", searchTerm, e.getMessage(), e);
    }

    return resultList;
  }

  /**
   * Unwraps a Google tracking link (something like {@code /url?q=<target>&sa=...}) and returns the still url-encoded value of its {@code q}
   * parameter. Links without the {@code url?q=} marker, or without a usable {@code q} value, are returned unchanged.
   *
   * @param href
   *          the href attribute of the result link
   * @return the (encoded) target url, or the given href if there is nothing to unwrap
   * @throws java.net.MalformedURLException
   *           if the href cannot be parsed as part of an url
   */
  private static String extractTargetUrl(String href) throws java.net.MalformedURLException {
    if (!href.contains("url?q=")) {
      return href;
    }

    // google manipulated tracking url; prefix a dummy host to let java.net.URL isolate the query part
    String query = new URL("http://google.com/" + href).getQuery();
    if (query == null) {
      return href;
    }

    String target = href;
    for (String param : query.split("[\\?&]")) {
      // limit 2: keep '=' characters inside the value and tolerate parameters without any value
      String[] pair = param.split("=", 2);
      if (pair.length == 2 && "q".equals(pair[0]) && !pair[1].isEmpty()) {
        target = pair[1];
      }
    }
    return target;
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy