All downloads are free. Search and download functionality uses the official Maven repository.

org.carrot2.source.opensearch.OpenSearchDocumentSource Maven / Gradle / Ivy

Go to download

Carrot2 search results clustering framework core, document sources and clustering algorithms.

There is a newer version: 4.6.0
Show newest version

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source.opensearch;

import java.util.Map;
import java.util.concurrent.Callable;

import org.carrot2.core.Document;
import org.carrot2.core.IDocumentSource;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Processing;
import org.carrot2.source.MultipageSearchEngine;
import org.carrot2.source.MultipageSearchEngineMetadata;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.util.StringUtils;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.Required;
import org.carrot2.util.attribute.constraint.IntRange;
import org.carrot2.util.resource.URLResourceWithParams;
import org.slf4j.Logger;

import com.google.common.collect.Maps;
import com.sun.syndication.fetcher.FeedFetcher;
import com.sun.syndication.fetcher.impl.HttpURLFeedFetcher;

/**
 * A {@link IDocumentSource} fetching {@link Document}s (search results) from an
 * OpenSearch feed.
 * 

* Based on code donated by Julien Nioche. * * @see OpenSearch.org */ @Bindable(prefix = "OpenSearchDocumentSource") public class OpenSearchDocumentSource extends MultipageSearchEngine { /** Logger for this class. */ final static Logger logger = org.slf4j.LoggerFactory.getLogger(OpenSearchDocumentSource.class); /** * Maximum concurrent threads from all instances of this component. */ private static final int MAX_CONCURRENT_THREADS = 10; /** * URL to fetch the search feed from. The URL template can contain variable place * holders as defined by the OpenSearch specification that will be replaced during * runtime. The format of the place holder is ${variable}. The following * variables are supported: *

    *
  • searchTerms will be replaced by the query
  • *
  • startIndex index of the first result to be searched. Mutually * exclusive with startPage
  • *
  • startPage index of the first result * to be searched. Mutually exclusive with startIndex.
  • *
  • count the number of search results per page
  • *
* *

Example URL feed templates for public services:

*
*
nature.com
*
http://www.nature.com/opensearch/request?interface=opensearch&operation=searchRetrieve&query=${searchTerms}&startRecord=${startIndex}&maximumRecords=${count}&httpAccept=application/rss%2Bxml
*
indeed.com
*
http://www.indeed.com/opensearch?q=${searchTerms}&start=${startIndex}&limit=${count}
*
*/ @Input @Processing @Init @Attribute @Required @Label("Feed URL template") @Level(AttributeLevel.BASIC) @Group(SERVICE) public String feedUrlTemplate; /** * Results per page. The number of results per page the document source will expect * the feed to return. */ @Input @Processing @Init @Attribute @Required @IntRange(min = 1) @Label("Results per page") @Level(AttributeLevel.BASIC) @Group(SERVICE) public int resultsPerPage = 50; /** * Maximum number of results. The maximum number of results the document source can * deliver. */ @Input @Processing @Init @Attribute @IntRange(min = 1) @Label("Maximum results") @Level(AttributeLevel.BASIC) @Group(SERVICE) public int maximumResults = 1000; /** * Additional parameters to be appended to {@link #feedUrlTemplate} on each request. */ @Input @Init @Processing @Attribute @Label("Feed URL parameters") @Level(AttributeLevel.ADVANCED) @Group(SERVICE) public Map feedUrlParams = null; /** * User agent header. The contents of the User-Agent HTTP header to use when making * requests to the feed URL. If empty or null value is provided, * the following User-Agent will be sent: Rome Client (http://tinyurl.com/64t5n) * Ver: UNKNOWN. */ @Input @Init @Processing @Attribute @Label("User agent") @Level(AttributeLevel.ADVANCED) @Group(SERVICE) public String userAgent = null; /** * Search engine metadata create upon initialization. */ private MultipageSearchEngineMetadata metadata; /** Fetcher for OpenSearch feed. 
*/ private FeedFetcher feedFetcher; /** searchTerms variable */ private static final String SEARCH_TERMS_VARIABLE_NAME = "searchTerms"; /** startIndex variable */ private static final String START_INDEX_VARIABLE_NAME = "startIndex"; /** startPage variable */ private static final String START_PAGE_VARIABLE_NAME = "startPage"; /** count variable */ private static final String COUNT_VARIABLE_NAME = "count"; @Override public void beforeProcessing() { // Verify that the attributes are legal final boolean hasStartPage = URLResourceWithParams.containsAttributePlaceholder( feedUrlTemplate, START_PAGE_VARIABLE_NAME); final boolean hasStartIndex = URLResourceWithParams.containsAttributePlaceholder( feedUrlTemplate, START_INDEX_VARIABLE_NAME); if (!(hasStartPage ^ hasStartIndex)) { throw new ProcessingException( "The feedUrlTemplate must contain either " + URLResourceWithParams .formatAttributePlaceholder(START_INDEX_VARIABLE_NAME) + " or " + URLResourceWithParams .formatAttributePlaceholder(START_PAGE_VARIABLE_NAME) + " variable"); } if (!URLResourceWithParams.containsAttributePlaceholder(feedUrlTemplate, SEARCH_TERMS_VARIABLE_NAME)) { throw new ProcessingException( "The feedUrlTemplate must contain " + URLResourceWithParams .formatAttributePlaceholder(SEARCH_TERMS_VARIABLE_NAME) + " variable"); } if (resultsPerPage == 0) { throw new ProcessingException("resultsPerPage must be set"); } this.metadata = new MultipageSearchEngineMetadata(resultsPerPage, maximumResults, hasStartPage); this.feedFetcher = new HttpURLFeedFetcher(); if (org.apache.commons.lang.StringUtils.isNotBlank(this.userAgent)) { this.feedFetcher.setUserAgent(this.userAgent); } } @Override public void process() throws ProcessingException { super.process(metadata, getSharedExecutor(MAX_CONCURRENT_THREADS, this.getClass())); } @Override protected Callable createFetcher(final SearchRange bucket) { return new SearchEngineResponseCallable() { public SearchEngineResponse search() throws Exception { // Replace 
variables in the URL final Map values = Maps.newHashMap(); values.put(SEARCH_TERMS_VARIABLE_NAME, query); values.put(START_INDEX_VARIABLE_NAME, bucket.start + 1); values.put(START_PAGE_VARIABLE_NAME, bucket.start + 1); values.put(COUNT_VARIABLE_NAME, bucket.results); final StringBuilder urlExtension = new StringBuilder( URLResourceWithParams.substituteAttributes(feedUrlTemplate, values)); if (feedUrlParams != null) { for (Map.Entry entry : feedUrlParams.entrySet()) { urlExtension.append('&'); urlExtension.append(entry.getKey()); urlExtension.append('='); urlExtension.append(StringUtils.urlEncodeWrapException(entry .getValue(), "UTF-8")); } } final String url = urlExtension.toString(); logger.debug("Fetching URL: " + url); return RomeFetcherUtils.fetchUrl(url, feedFetcher); } }; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy