All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.tinymediamanager.scraper.SearchTitleWithGoogle Maven / Gradle / Ivy

There is a newer version: 3.0.5
Show newest version
package org.tinymediamanager.scraper;

import java.io.InputStream;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tinymediamanager.scraper.http.InMemoryCachedUrl;
import org.tinymediamanager.scraper.http.Url;

public class SearchTitleWithGoogle {
  private static final Logger LOGGER        = LoggerFactory.getLogger(SearchTitleWithGoogle.class);
  private static final String PAGE_ENCODING = "UTF-8";

  /**
   * Does a fallback search with google, returning the first 10 results...
*
* You have to
* 1) check, if url starts with your desired destination page (aka filter results)
* 2) get the ID from url (if you work with it) * * @param site * the base hostname like "zelluloid.de" * @param mpi * @param options * @return MediaSearchResult, but NO id filled. Scraper MUST work with url-only! */ public List search(String site, MediaProviderInfo mpi, MediaSearchOptions options) { LOGGER.debug("SearchTitleWithGoogle() - {}", options); List resultList = new ArrayList<>(); String searchUrl = ""; String searchTerm = ""; try { if (StringUtils.isNotEmpty(options.getQuery())) { if (!site.startsWith("http")) { site = "http://" + site; } site = new URL(site).getHost(); searchTerm = options.getQuery(); String lang = options.getLanguage().getLanguage(); searchUrl = "https://www.google." + lang + "/search?q=" + URLEncoder.encode("site:" + site + " " + searchTerm, PAGE_ENCODING); LOGGER.debug("search for: {} ({})", searchTerm, searchUrl); } else { LOGGER.debug("empty searchString"); return resultList; } } catch (Exception e) { LOGGER.warn("error searching {}", e.getMessage()); return resultList; } Document doc = null; try { Url url = new InMemoryCachedUrl(searchUrl); InputStream in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); if (doc == null) { return resultList; } Elements res = doc.getElementsByClass("r"); for (Element el : res) { Element a = el.getElementsByTag("a").first(); MediaSearchResult sr = new MediaSearchResult(mpi.getId(), options.getMediaType()); String gurl = a.attr("href"); if (gurl.contains("url?q=")) { // google manipulated tracking url URL tmp = new URL("http://google.com/" + gurl); String[] params = tmp.getQuery().split("[\\?&]"); for (String param : params) { String name = param.split("=")[0]; String value = param.split("=")[1]; if (name.equals("q")) { gurl = value; } } } sr.setUrl(URLDecoder.decode(gurl, PAGE_ENCODING)); // sr.setId(mpi.getId()); // we have no clue about ID!! sr.setTitle(a.text().replaceAll(site, "(via Google)")); resultList.add(sr); } } catch (Exception e) { LOGGER.error("failed to search for {} - {}", searchTerm, e.getMessage()); } return resultList; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy