org.carrot2.source.opensearch.RomeFetcherUtils Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of carrot2-core Show documentation
Carrot2 search results clustering framework core, document sources and clustering algorithms.
There is a newer version: 4.6.0

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source.opensearch;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang.StringEscapeUtils;
import org.carrot2.core.Document;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.util.StringUtils;

import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.fetcher.FeedFetcher;
import com.sun.syndication.fetcher.FetcherException;
import com.sun.syndication.io.FeedException;

/**
 * Utility methods for working with Rome fetcher.
 */
public class RomeFetcherUtils
{
    /**
     * Fetches an OpenSearch feed from the provided URL and returns the entries as Carrot2
     * {@link SearchEngineResponse}. Each feed entry becomes one {@link Document} with
     * its {@link Document#TITLE}, {@link Document#SUMMARY} and
     * {@link Document#CONTENT_URL} fields populated from the entry's title, description
     * and link, respectively. Titles and summaries are HTML-unescaped and stripped of
     * HTML tags. Entries without a description get no {@link Document#SUMMARY} field.
     * 
     * @param url the OpenSearch feed to fetch
     * @param feedFetcher Rome fetcher to use
     * @return {@link SearchEngineResponse} containing entries from the feed; empty if
     *         the fetcher returned no feed
     * @throws MalformedURLException if <code>url</code> cannot be parsed as a URL
     * @throws IOException propagated from the fetcher
     * @throws FeedException propagated from the fetcher
     * @throws FetcherException propagated from the fetcher
     */
    public static SearchEngineResponse fetchUrl(final String url, FeedFetcher feedFetcher)
        throws IOException, FeedException, FetcherException, MalformedURLException
    {
        /*
         * TODO: Rome fetcher uses SUN's HttpClient and opens a persistent HTTP connection
         * (background thread that keeps reference to the class loader). This causes minor
         * memory leaks when reloading Web applications. Consider: 1) patching rome
         * fetcher sources and adding Connection: close to request headers, 2) using
         * Apache HttpClient, 3) using manual fetch of the syndication feed.
         */
        final SyndFeed feed = feedFetcher.retrieveFeed(new URL(url));
        final SearchEngineResponse response = new SearchEngineResponse();

        // The documentation does not mention that null value can be returned
        // but we've seen a NPE here:
        // http://builds.carrot2.org/browse/C2HEAD-SOURCES-4.
        if (feed == null)
        {
            return response;
        }

        // Iterating as Object keeps this independent of whether the Rome version in
        // use declares getEntries() with a raw or a parameterized List.
        for (Object item : feed.getEntries())
        {
            final SyndEntry entry = (SyndEntry) item;
            final Document document = new Document();

            document.setField(Document.TITLE, clean(entry.getTitle()));

            // The description element is optional in syndication feeds; a single
            // entry without one must not abort processing of the whole feed.
            if (entry.getDescription() != null)
            {
                document.setField(Document.SUMMARY, clean(entry.getDescription()
                    .getValue()));
            }

            document.setField(Document.CONTENT_URL, entry.getLink());

            response.results.add(document);
        }

        return response;
    }

    /**
     * Unescapes HTML entities in the provided string and then removes HTML tags from
     * it.
     * 
     * @param string text to clean, may be <code>null</code>
     * @return cleaned text or <code>null</code> if <code>string</code> was
     *         <code>null</code>
     */
    private static String clean(String string)
    {
        // Titles and description values may be absent; short-circuit here rather
        // than rely on the null handling of the helpers below.
        if (string == null)
        {
            return null;
        }

        return StringUtils.removeHtmlTags(StringEscapeUtils.unescapeHtml(string));
    }
}