
/*
* Copyright 2004 Sun Microsystems, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.rometools.fetcher.impl;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.zip.GZIPInputStream;
import com.rometools.fetcher.FetcherEvent;
import com.rometools.fetcher.FetcherException;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.SyndFeedInput;
import com.rometools.rome.io.XmlReader;
import com.rometools.utils.IO;
/**
* <p>
* Class to retrieve syndication files via HTTP.
* </p>
*
* <p>
* If passed a {@link com.rometools.fetcher.impl.FeedFetcherCache} in the constructor it will use
* conditional gets to only retrieve modified content.
* </p>
*
* <p>
* The class uses the Accept-Encoding: gzip header to retrieve gzipped feeds where supported by the
* server.
* </p>
*
* <p>
* Simple usage:
* </p>
*
* <pre>
* // create the cache
* FeedFetcherCache feedInfoCache = HashMapFeedInfoCache.getInstance();
* // retrieve the feed the first time
* // any subsequent request will use conditional gets and only
* // retrieve the resource if it has changed
* SyndFeed feed = new HttpURLFeedFetcher(feedInfoCache).retrieveFeed(feedUrl);
* </pre>
*
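* <p>
* A sketch of observing fetcher activity, assuming the listener registration inherited from
* {@code AbstractFeedFetcher} and the accessors on {@code FetcherEvent}:
* </p>
*
* <pre>
* HttpURLFeedFetcher fetcher = new HttpURLFeedFetcher(feedInfoCache);
* fetcher.addFetcherEventListener(new FetcherListener() {
*     public void fetcherEvent(final FetcherEvent event) {
*         // one of EVENT_TYPE_FEED_POLLED, EVENT_TYPE_FEED_RETRIEVED or EVENT_TYPE_FEED_UNCHANGED
*         System.out.println(event.getEventType() + ": " + event.getUrlString());
*     }
* });
* </pre>
*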
* @see <a href="http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers">http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers</a>
* @see <a href="http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level">http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level</a>
* @see <a href="http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html">http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html</a>
* @author Nick Lothian
*
* @deprecated ROME Fetcher will be dropped in the next major version of ROME (version 2). For more information and some migration hints,
* please have a look at our detailed explanation.
*/
@Deprecated
public class HttpURLFeedFetcher extends AbstractFeedFetcher {
private volatile int connectTimeout = -1;
static final int POLL_EVENT = 1;
static final int RETRIEVE_EVENT = 2;
static final int UNCHANGED_EVENT = 3;
private FeedFetcherCache feedInfoCache;
/**
* Constructor to use HttpURLFeedFetcher without caching of feeds
*
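* <p>
* For example (a sketch; without a cache every call retrieves the feed in full):
* </p>
*
* <pre>
* SyndFeed feed = new HttpURLFeedFetcher().retrieveFeed(feedUrl);
* </pre>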
*/
public HttpURLFeedFetcher() {
this(null);
}
/**
* Constructor to enable HttpURLFeedFetcher to cache feeds
*
* @param feedInfoCache - an instance of the FeedFetcherCache interface
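*
* <p>
* For example (a sketch, assuming the bundled {@code HashMapFeedInfoCache} implementation):
* </p>
*
* <pre>
* FeedFetcher fetcher = new HttpURLFeedFetcher(HashMapFeedInfoCache.getInstance());
* </pre>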
*/
public HttpURLFeedFetcher(final FeedFetcherCache feedInfoCache) {
setFeedInfoCache(feedInfoCache);
}
@Override
public SyndFeed retrieveFeed(final URL feedUrl) throws IllegalArgumentException, IOException, FeedException, FetcherException {
return this.retrieveFeed(getUserAgent(), feedUrl);
}
/**
* Retrieve a feed over HTTP
*
* @param userAgent the value to send in the User-Agent request header
* @param feedUrl A non-null URL of an RSS/Atom feed to retrieve
* @return A {@link com.rometools.rome.feed.synd.SyndFeed} object
* @throws IllegalArgumentException if the URL is null
* @throws IOException if a TCP error occurs
* @throws FeedException if the feed is not valid
* @throws FetcherException if an HTTP error occurred
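*
* <p>
* For example (a sketch; the user-agent string and URL are illustrative only):
* </p>
*
* <pre>
* SyndFeed feed = fetcher.retrieveFeed("MyAggregator/1.0", new URL("http://example.com/feed.xml"));
* </pre>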
*/
@Override
public SyndFeed retrieveFeed(final String userAgent, final URL feedUrl) throws IllegalArgumentException, IOException, FeedException, FetcherException {
if (feedUrl == null) {
throw new IllegalArgumentException("null is not a valid URL");
}
final URLConnection connection = feedUrl.openConnection();
if (!(connection instanceof HttpURLConnection)) {
throw new IllegalArgumentException(feedUrl.toExternalForm() + " is not a valid HTTP URL");
}
final HttpURLConnection httpConnection = (HttpURLConnection) connection;
if (connectTimeout >= 0) {
httpConnection.setConnectTimeout(connectTimeout);
}
// httpConnection.setInstanceFollowRedirects(true); // this is true by default, but can be
// changed on a class-wide basis
final FeedFetcherCache cache = getFeedInfoCache();
if (cache != null) {
SyndFeedInfo syndFeedInfo = cache.getFeedInfo(feedUrl);
setRequestHeaders(connection, syndFeedInfo, userAgent);
httpConnection.connect();
try {
fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
if (syndFeedInfo == null) {
// this is a feed that hasn't been retrieved
syndFeedInfo = new SyndFeedInfo();
retrieveAndCacheFeed(feedUrl, syndFeedInfo, httpConnection);
} else {
// check the response code
final int responseCode = httpConnection.getResponseCode();
if (responseCode != HttpURLConnection.HTTP_NOT_MODIFIED) {
// the response code is not 304 NOT MODIFIED
// This is either because the feed server
// does not support conditional gets
// or because the feed has changed
retrieveAndCacheFeed(feedUrl, syndFeedInfo, httpConnection);
} else {
// the feed does not need retrieving
fireEvent(FetcherEvent.EVENT_TYPE_FEED_UNCHANGED, connection);
}
}
return syndFeedInfo.getSyndFeed();
} finally {
httpConnection.disconnect();
}
} else {
fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
InputStream inputStream = null;
setRequestHeaders(connection, null, userAgent);
httpConnection.connect();
try {
inputStream = httpConnection.getInputStream();
return getSyndFeedFromStream(inputStream, connection);
} catch (final java.io.IOException e) {
handleErrorCodes(((HttpURLConnection) connection).getResponseCode());
} finally {
IO.close(inputStream);
httpConnection.disconnect();
}
// only reached if handleErrorCodes did not throw, i.e. the IOException
// was not caused by an HTTP error status
return null;
}
}
protected void retrieveAndCacheFeed(final URL feedUrl, final SyndFeedInfo syndFeedInfo, final HttpURLConnection connection)
throws IllegalArgumentException, FeedException, FetcherException, IOException {
handleErrorCodes(connection.getResponseCode());
resetFeedInfo(feedUrl, syndFeedInfo, connection);
final FeedFetcherCache cache = getFeedInfoCache();
// resetting feed info in the cache
// could be needed for some implementations
// of FeedFetcherCache (eg, distributed HashTables)
if (cache != null) {
cache.setFeedInfo(feedUrl, syndFeedInfo);
}
}
protected void resetFeedInfo(final URL originalUrl, final SyndFeedInfo syndFeedInfo, final HttpURLConnection connection) throws IllegalArgumentException,
IOException, FeedException {
// need to always set the URL because this may have changed due to 3xx redirects
syndFeedInfo.setUrl(connection.getURL());
// the ID is a persistent value that should stay the same even if the URL for the
// feed changes (eg, by 3xx redirects)
syndFeedInfo.setId(originalUrl.toString());
// This will be 0 if the server doesn't support or isn't setting the last modified header
syndFeedInfo.setLastModified(connection.getLastModified());
// This will be null if the server doesn't support or isn't setting the ETag header
syndFeedInfo.setETag(connection.getHeaderField("ETag"));
// get the contents
InputStream inputStream = null;
try {
inputStream = connection.getInputStream();
SyndFeed syndFeed = getSyndFeedFromStream(inputStream, connection);
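// RFC 3229 delta encoding: when the request carried "A-IM: feed" and the server replied with
// 226 "IM Used" plus an "IM: feed" header, the response body contains only the new entries,
// so it has to be merged with the previously cached feed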
final String imHeader = connection.getHeaderField("IM");
if (isUsingDeltaEncoding() && imHeader != null && imHeader.contains("feed")) {
final FeedFetcherCache cache = getFeedInfoCache();
if (cache != null && connection.getResponseCode() == 226) {
// client is setup to use http delta encoding and the server supports it and has
// returned a delta encoded response
// This response only includes new items
final SyndFeedInfo cachedInfo = cache.getFeedInfo(originalUrl);
if (cachedInfo != null) {
final SyndFeed cachedFeed = cachedInfo.getSyndFeed();
// set the new feed to be the original feed plus the new items
syndFeed = combineFeeds(cachedFeed, syndFeed);
}
}
}
syndFeedInfo.setSyndFeed(syndFeed);
} finally {
IO.close(inputStream);
}
}
/**
* <p>
* Set appropriate HTTP headers, including conditional get and gzip encoding headers.
* </p>
*
* @param connection A URLConnection
* @param syndFeedInfo The SyndFeedInfo for the feed to be retrieved. May be null
* @param userAgent the value to place in the User-Agent HTTP header
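*
* <p>
* For illustration, a request prepared by this method for a previously cached feed may carry
* headers along these lines (the values shown are examples only; A-IM is sent only when delta
* encoding is enabled):
* </p>
*
* <pre>
* If-Modified-Since: (last-modified timestamp from the cached SyndFeedInfo)
* If-None-Match: "abc123"
* Accept-Encoding: gzip
* User-Agent: (the supplied userAgent)
* A-IM: feed
* </pre>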
*/
protected void setRequestHeaders(final URLConnection connection, final SyndFeedInfo syndFeedInfo, final String userAgent) {
if (syndFeedInfo != null) {
// set the headers to get feed only if modified
// we support the use of both last modified and eTag headers
if (syndFeedInfo.getLastModified() != null) {
final Object lastModified = syndFeedInfo.getLastModified();
if (lastModified instanceof Long) {
connection.setIfModifiedSince((Long) syndFeedInfo.getLastModified());
}
}
if (syndFeedInfo.getETag() != null) {
connection.setRequestProperty("If-None-Match", syndFeedInfo.getETag());
}
}
// header to retrieve feed gzipped
connection.setRequestProperty("Accept-Encoding", "gzip");
connection.addRequestProperty("User-Agent", userAgent);
if (isUsingDeltaEncoding()) {
connection.addRequestProperty("A-IM", "feed");
}
}
private SyndFeed readSyndFeedFromStream(final InputStream inputStream, final URLConnection connection) throws IOException, IllegalArgumentException,
FeedException {
BufferedInputStream is;
if ("gzip".equalsIgnoreCase(connection.getContentEncoding())) {
// handle gzip encoded content
is = new BufferedInputStream(new GZIPInputStream(inputStream));
} else {
is = new BufferedInputStream(inputStream);
}
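// let XmlReader determine the character encoding, preferring the charset of the HTTP
// Content-Type header when the server supplied one (lenient detection enabled)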
final XmlReader reader;
if (connection.getHeaderField("Content-Type") != null) {
reader = new XmlReader(is, connection.getHeaderField("Content-Type"), true);
} else {
reader = new XmlReader(is, true);
}
final SyndFeedInput syndFeedInput = new SyndFeedInput();
syndFeedInput.setPreserveWireFeed(isPreserveWireFeed());
syndFeedInput.setAllowDoctypes(isAllowDoctypes());
return syndFeedInput.build(reader);
}
private SyndFeed getSyndFeedFromStream(final InputStream inputStream, final URLConnection connection) throws IOException, IllegalArgumentException,
FeedException {
final SyndFeed feed = readSyndFeedFromStream(inputStream, connection);
fireEvent(FetcherEvent.EVENT_TYPE_FEED_RETRIEVED, connection, feed);
return feed;
}
/**
* @return The FeedFetcherCache used by this fetcher (may be null)
*/
public synchronized FeedFetcherCache getFeedInfoCache() {
return feedInfoCache;
}
/**
* @param cache The cache to be used by this fetcher (pass null to stop using a cache)
*/
public synchronized void setFeedInfoCache(final FeedFetcherCache cache) {
feedInfoCache = cache;
}
/**
* @param timeout the connect timeout in milliseconds; see {@link java.net.URLConnection#setConnectTimeout(int)}
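*
* <p>
* For example (a sketch; {@code fetcher} is any HttpURLFeedFetcher instance):
* </p>
*
* <pre>
* fetcher.setConnectTimeout(5000); // give up on connection attempts after five seconds
* </pre>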
*/
public synchronized void setConnectTimeout(final int timeout) {
connectTimeout = timeout;
}
}