/*
 * All downloads are free. Search and download functionality uses the official Maven repository.
 *
 * com.ibm.watson.developer_cloud.cognitive_client.Search (Maven / Gradle / Ivy)
 *
 * The newest version!
 */
/**
 * 
 */
package com.ibm.watson.developer_cloud.cognitive_client;

import java.util.ArrayList;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;





import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;



/**
 * @author ArunIyengar
 * This class implements Web searching
 *
 */
public class Search {
    
    public enum SearchType {
        GOOGLE_REGULAR,
        GOOGLE_NEWS,
/*        BING_REGULAR,
        BING_NEWS,
        YAHOO
*/
    }

    private static final String GOOGLE_BASE_URL = "https://www.google.com/search?";
    private static final String QUERY_PREFIX = "q=";
    private static final String GOOGLE_SEARCH_URL = GOOGLE_BASE_URL + QUERY_PREFIX;
    private static final String NEWS = "hl=en&gl=us&tbm=nws&";
    private static final String GOOGLE_NEWS_URL = GOOGLE_BASE_URL + NEWS + QUERY_PREFIX;
    
/*    private static final String BING_BASE_URL = "https://www.bing.com/";
    private static final String BING_QUERY_PREFIX = "search?q=";
    private static final String BING_SEARCH_URL = BING_BASE_URL + BING_QUERY_PREFIX;
    private static final String BING_NEWS = "news/";
    private static final String BING_NEWS_URL = BING_BASE_URL + BING_NEWS + BING_QUERY_PREFIX;
    private static final String YAHOO_SEARCH_URL = "https://search.yahoo.com/search?p=";
*/
    
    
    /**
     * Perform a Google search on a query and return an ArrayList of URLs found. 
     * 
     * @param query
     *            query to pass to search engine
     * @param numResults
     *            number of documents to search for
     * @param searchType
     *            Type of search
     * @param verbose
     *            true to print out information for debugging purposes
     * 
     * @return ArrayList of urls found
     * 
     * */
    public static ArrayList search(String query, int numResults, SearchType searchType, boolean verbose) {
        String encodedQuery;
        ArrayList urls = new ArrayList();
        try {
            encodedQuery = URLEncoder.encode(query, "UTF-8") + "&num=" + numResults;
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return urls;
        }
        String searchURL = searchPrefixString(searchType) + encodedQuery;
        if (verbose) {
            System.out.println(searchURL);
        }
        Document doc;
        try {
            // Without proper User-Agent, we will get 403 error
            doc = Jsoup.connect(searchURL).userAgent("Mozilla/5.0").get();
        } catch (IOException e) {
            e.printStackTrace();
            return urls;
        }
                
        // If google search results HTML change the 

a"); for (Element result : results) { String linkHref = result.attr("href"); String linkText = result.text(); String url = result.absUrl("href"); // Google returns URLs in format "http://www.google.com/url?q=&sa=U&ei=". try { url = URLDecoder.decode(url.substring(url.indexOf('=') + 1, url.indexOf('&')), "UTF-8"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); continue; } if (!url.startsWith("http")) { continue; // Ads/news/etc. } urls.add(url); if (verbose) { System.out.println("Text::" + linkText + ", URL (undecoded): " + linkHref.substring(6, linkHref.indexOf("&"))); System.out.println("URL (decoded): " + url); } } System.out.println("Search.search: " + urls.size() + " urls found"); return urls; } /** * Return a string corresponding to the type of search. * * @param searchType * Type of search * * @return String corresponding to the type of search * * */ public static String searchPrefixString(SearchType searchType) { switch(searchType) { case GOOGLE_REGULAR: return GOOGLE_SEARCH_URL; default: return GOOGLE_NEWS_URL; /* case BING_REGULAR: return BING_SEARCH_URL; case BING_NEWS: return BING_NEWS_URL; default: return YAHOO_SEARCH_URL; */ } } }





// © 2015 - 2025 Weber Informatics LLC | Privacy Policy