All downloads are free. Search and download functionalities use the official Maven repository.

org.carrot2.source.MultipageSearchEngine Maven / Gradle / Ivy

There is a newer version: 4.6.0
Show newest version

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2018, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;

import org.carrot2.core.*;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.*;

import org.carrot2.shaded.guava.common.base.Predicate;

/**
 * A base class facilitating implementation of {@link IDocumentSource}s wrapping external
 * search engines with remote/ network-based interfaces. This class implements helper
 * methods for concurrent querying of search services that limit the number of search
 * results returned in one request.
 * 
 * @see SimpleSearchEngine
 */
@Bindable
public abstract class MultipageSearchEngine extends SearchEngineBase
{
    /**
     * Search mode defines how fetchers returned from {@link #createFetcher}
     * are called.
     * @see SearchMode
     */
    @Processing
    @Input
    @Attribute(key = "search-mode")
    @Level(AttributeLevel.ADVANCED)
    @Label("Search Mode")
    @Group(DefaultGroups.SOURCE_PAGING)
    public SearchMode searchMode = SearchMode.SPECULATIVE;

    /**
     * Run a request the search engine's API, setting documents to the set of
     * returned documents.
     */
    protected void process(MultipageSearchEngineMetadata metadata,
        ExecutorService executor) throws ProcessingException
    {
        final SearchEngineResponse [] responses = runQuery(query, start, results,
            metadata, executor);

        compressed = false;
        if (responses.length > 0)
        {
            // Collect documents from the responses.
            documents = new ArrayList(Math
                .min(results, metadata.maxResultIndex));
            collectDocuments(documents, responses);

            // Filter out duplicated URLs.
            final Iterator i = documents.iterator();
            final Predicate p = new UniqueFieldPredicate(Document.CONTENT_URL);
            while (i.hasNext())
            {
                if (!p.apply(i.next()))
                {
                    i.remove();
                }
            }

            resultsTotal = responses[0].getResultsTotal();

            for (int j = 0; j < responses.length; j++)
            {
                final String compression = (String) responses[j].metadata
                    .get(SearchEngineResponse.COMPRESSION_KEY);
                if (compression != null && "gzip".contains(compression))
                {
                    compressed = true;
                }
            }
        }
        else
        {
            documents = Collections. emptyList();
            resultsTotal = 0;
        }
    }

    /**
     * Subclasses should override this method and return a {@link Callable} instance that
     * fetches search results in the given range.
     * 

* Note the query (if any is required) should be passed at the concrete class level. * We are not concerned with it here. * * @param bucket The search range to fetch. */ protected abstract Callable createFetcher( final SearchRange bucket); /** * Collects documents from an array of search engine's responses. */ protected final void collectDocuments(Collection collector, SearchEngineResponse [] responses) { for (final SearchEngineResponse response : responses) { collector.addAll(response.results); } } /** * This method implements the logic of querying a typical search engine. If the number * of requested results is higher than the number of results on one response page, * then multiple (possibly concurrent) requests are issued via the provided * {@link ExecutorService}. */ protected final SearchEngineResponse [] runQuery(final String query, final int start, final int results, MultipageSearchEngineMetadata metadata, final ExecutorService executor) throws ProcessingException { this.statistics.incrQueryCount(); // Split the requested range into pages. SearchRange [] buckets = SearchRange.getSearchRanges(start, results, metadata.maxResultIndex, metadata.resultsPerPage, metadata.incrementByPage); // Check preconditions. if (query == null || query.trim().equals("") || buckets.length == 0) { return new SearchEngineResponse [0]; } try { // Initialize output documents array. final ArrayList responses = new ArrayList( buckets.length); // If in conservative mode, run the first request to estimate the // number of needed results. if (buckets.length == 1 || searchMode == SearchMode.CONSERVATIVE) { final SearchEngineResponse response = createFetcher(buckets[0]).call(); final long resultsTotal = response.getResultsTotal(); responses.add(response); if (buckets.length == 1) { // If there was just one bucket, there is no need to go further on. 
return responses.toArray(new SearchEngineResponse [responses.size()]); } else { // We do have an estimate of results now, modify it // and recalculate the buckets. if (resultsTotal != -1 && resultsTotal < results) { buckets = SearchRange.getSearchRanges(buckets[0].results, (int) resultsTotal, metadata.maxResultIndex, metadata.resultsPerPage, metadata.incrementByPage); } } } // Run concurrent requests using the executor. final ArrayList> fetchers = new ArrayList>( buckets.length); for (final SearchRange r : buckets) { fetchers.add(createFetcher(r)); } // Run requests in parallel. final List> futures = executor .invokeAll(fetchers); // Collect results. for (final Future future : futures) { if (!future.isCancelled()) { responses.add(future.get()); } } return responses.toArray(new SearchEngineResponse [responses.size()]); } catch (final IOException e) { throw new ProcessingException(e.getMessage(), e); } catch (final InterruptedException e) { // If interrupted, return with no error. return new SearchEngineResponse [0]; } catch (final Exception e) { Throwable cause = e.getCause(); if (cause == null) { cause = e; } throw new ProcessingException(cause.getMessage(), e); } } /** * An implementation of {@link Callable} that increments page request count statistics * before the actual search is made. */ protected abstract class SearchEngineResponseCallable implements Callable { public final SearchEngineResponse call() throws Exception { statistics.incrPageRequestCount(); final SearchEngineResponse response = search(); afterFetch(response); return response; } /** * Performs the actual search and returns the response. */ public abstract SearchEngineResponse search() throws Exception; } /** * A single result window to fetch. */ protected final static class SearchRange { /** Empty range. */ private static final SearchRange [] EMPTY_RANGE = new SearchRange [0]; /** Start index from which to search (inclusive). */ public final int start; /** How many results to fetch. 
*/ public final int results; /** * Create a new search range. * * @param start Start index of the first result to return (0-based). * @param results The number of results to return. The actual number of results * returned by a search service may be lower than this number. */ public SearchRange(int start, int results) { this.start = start; this.results = results; } /** * Given an unconstrained start and results count, adjust it to the allowed window * and split into page buckets if necessary. */ public static SearchRange [] getSearchRanges(int start, int results, int maxIndex, int resultsPerPage, boolean incrementByPage) { // Sanity check. results = Math.max(results, 0); start = Math.max(start, 0); int startIndex = Math.min(start * (incrementByPage ? resultsPerPage : 1), maxIndex); final int endIndex = Math.min(start * (incrementByPage ? resultsPerPage : 1) + results, maxIndex); final int resultsNeeded = endIndex - startIndex; if (resultsNeeded == 0) { return EMPTY_RANGE; } final int lastBucketSize = resultsNeeded % resultsPerPage; final int bucketsNeeded = resultsNeeded / resultsPerPage + (lastBucketSize > 0 ? 1 : 0); final SearchRange [] buckets = new SearchRange [bucketsNeeded]; for (int i = 0; i < buckets.length; i++) { final int window = Math.min(resultsPerPage, endIndex - startIndex); buckets[i] = new SearchRange((incrementByPage ? start + i : startIndex), window); startIndex += window; } return buckets; } } /** * Search mode for data source components that implement parallel request to some * search service. */ public enum SearchMode { /** * In this mode, an initial search request is performed to estimate the number of * documents available on the server. Then the requested number of documents is * adjusted according to the number of documents available to minimize the number * of requests. */ CONSERVATIVE, /** * In this mode, the number of requested documents is divided by the maximum * number of documents the search engine can return in a single request. 
The * result is the number of concurrent requests launched to the search * service. *

* Note that speculative threads cause larger load on the search service and will * exhaust your request pool quicker (if it is limited). */ SPECULATIVE, } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy