// com.rometools.rome.io.impl.RSS090Parser (Maven / Gradle / Ivy)
/*
* Copyright 2004 Sun Microsystems, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.rometools.rome.io.impl;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Namespace;
import com.rometools.rome.feed.WireFeed;
import com.rometools.rome.feed.module.Module;
import com.rometools.rome.feed.rss.Channel;
import com.rometools.rome.feed.rss.Image;
import com.rometools.rome.feed.rss.Item;
import com.rometools.rome.feed.rss.TextInput;
import com.rometools.rome.io.FeedException;
public class RSS090Parser extends BaseWireFeedParser {
private static final String RDF_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
private static final String RSS_URI = "http://my.netscape.com/rdf/simple/0.9/";
private static final String CONTENT_URI = "http://purl.org/rss/1.0/modules/content/";
private static final Namespace RDF_NS = Namespace.getNamespace(RDF_URI);
private static final Namespace RSS_NS = Namespace.getNamespace(RSS_URI);
private static final Namespace CONTENT_NS = Namespace.getNamespace(CONTENT_URI);
public RSS090Parser() {
this("rss_0.9", RSS_NS);
}
protected RSS090Parser(final String type, final Namespace ns) {
super(type, ns);
}
@Override
public boolean isMyType(final Document document) {
final Element rssRoot = document.getRootElement();
final Namespace defaultNS = rssRoot.getNamespace();
final List additionalNSs = rssRoot.getAdditionalNamespaces();
boolean myType = false;
if (defaultNS != null && defaultNS.equals(getRDFNamespace()) && additionalNSs != null) {
for (final Namespace namespace : additionalNSs) {
if (getRSSNamespace().equals(namespace)) {
myType = true;
break;
}
}
}
return myType;
}
@Override
public WireFeed parse(final Document document, final boolean validate, final Locale locale) throws IllegalArgumentException, FeedException {
if (validate) {
validateFeed(document);
}
final Element rssRoot = document.getRootElement();
return parseChannel(rssRoot, locale);
}
protected void validateFeed(final Document document) throws FeedException {
// TODO here we have to validate the Feed against a schema or whatever not sure how to do it
// one posibility would be to inject our own schema for the feed (they don't exist out
// there) to the document, produce an ouput and attempt to parse it again with validation
// turned on. otherwise will have to check the document elements by hand.
}
/**
* Returns the namespace used by RSS elements in document of the RSS version the parser
* supports.
*
* This implementation returns the EMTPY namespace.
*
*
* @return returns the EMPTY namespace.
*/
protected Namespace getRSSNamespace() {
return RSS_NS;
}
/**
* Returns the namespace used by RDF elements in document of the RSS version the parser
* supports.
*
* This implementation returns the EMTPY namespace.
*
*
* @return returns the EMPTY namespace.
*/
protected Namespace getRDFNamespace() {
return RDF_NS;
}
/**
* Returns the namespace used by Content Module elements in document.
*
* This implementation returns the EMTPY namespace.
*
*
* @return returns the EMPTY namespace.
*/
protected Namespace getContentNamespace() {
return CONTENT_NS;
}
/**
* Parses the root element of an RSS document into a Channel bean.
*
* It reads title, link and description and delegates to parseImage, parseItems and
* parseTextInput. This delegation always passes the root element of the RSS document as
* different RSS version may have this information in different parts of the XML tree (no
* assumptions made thanks to the specs variaty)
*
*
* @param rssRoot the root element of the RSS document to parse.
* @return the parsed Channel bean.
*/
protected WireFeed parseChannel(final Element rssRoot, final Locale locale) {
final Channel channel = new Channel(getType());
channel.setStyleSheet(getStyleSheet(rssRoot.getDocument()));
final Element eChannel = rssRoot.getChild("channel", getRSSNamespace());
final Element title = eChannel.getChild("title", getRSSNamespace());
if (title != null) {
channel.setTitle(title.getText());
}
final Element link = eChannel.getChild("link", getRSSNamespace());
if (link != null) {
channel.setLink(link.getText());
}
final Element description = eChannel.getChild("description", getRSSNamespace());
if (description != null) {
channel.setDescription(description.getText());
}
channel.setImage(parseImage(rssRoot));
channel.setTextInput(parseTextInput(rssRoot));
// Unfortunately Microsoft's SSE extension has a special case of effectively putting the
// sharing channel module inside the RSS tag and not inside the channel itself. So we also
// need to look for channel modules from the root RSS element.
final List allFeedModules = new ArrayList();
final List rootModules = parseFeedModules(rssRoot, locale);
final List channelModules = parseFeedModules(eChannel, locale);
if (rootModules != null) {
allFeedModules.addAll(rootModules);
}
if (channelModules != null) {
allFeedModules.addAll(channelModules);
}
channel.setModules(allFeedModules);
channel.setItems(parseItems(rssRoot, locale));
final List foreignMarkup = extractForeignMarkup(eChannel, channel, getRSSNamespace());
if (!foreignMarkup.isEmpty()) {
channel.setForeignMarkup(foreignMarkup);
}
return channel;
}
/**
* This method exists because RSS0.90 and RSS1.0 have the 'item' elements under the root
* elemment. And RSS0.91, RSS0.02, RSS0.93, RSS0.94 and RSS2.0 have the item elements under the
* 'channel' element.
*
*/
protected List getItems(final Element rssRoot) {
return rssRoot.getChildren("item", getRSSNamespace());
}
/**
* This method exists because RSS0.90 and RSS1.0 have the 'image' element under the root
* elemment. And RSS0.91, RSS0.02, RSS0.93, RSS0.94 and RSS2.0 have it under the 'channel'
* element.
*
*/
protected Element getImage(final Element rssRoot) {
return rssRoot.getChild("image", getRSSNamespace());
}
/**
* This method exists because RSS0.90 and RSS1.0 have the 'textinput' element under the root
* elemment. And RSS0.91, RSS0.02, RSS0.93, RSS0.94 and RSS2.0 have it under the 'channel'
* element.
*
*/
protected Element getTextInput(final Element rssRoot) {
return rssRoot.getChild("textinput", getRSSNamespace());
}
/**
* Parses the root element of an RSS document looking for image information.
*
* It reads title and url out of the 'image' element.
*
*
* @param rssRoot the root element of the RSS document to parse for image information.
* @return the parsed image bean.
*/
protected Image parseImage(final Element rssRoot) {
Image image = null;
final Element eImage = getImage(rssRoot);
if (eImage != null) {
image = new Image();
final Element title = eImage.getChild("title", getRSSNamespace());
if (title != null) {
image.setTitle(title.getText());
}
final Element url = eImage.getChild("url", getRSSNamespace());
if (url != null) {
image.setUrl(url.getText());
}
final Element link = eImage.getChild("link", getRSSNamespace());
if (link != null) {
image.setLink(link.getText());
}
}
return image;
}
/**
* Parses the root element of an RSS document looking for all items information.
*
* It iterates through the item elements list, obtained from the getItems() method, and invoke
* parseItem() for each item element. The resulting RSSItem of each item element is stored in a
* list.
*
*
* @param rssRoot the root element of the RSS document to parse for all items information.
* @return a list with all the parsed RSSItem beans.
*/
protected List- parseItems(final Element rssRoot, final Locale locale) {
final List
- items = new ArrayList
- ();
for (final Element item : getItems(rssRoot)) {
items.add(parseItem(rssRoot, item, locale));
}
return items;
}
/**
* Parses an item element of an RSS document looking for item information.
*
* It reads title and link out of the 'item' element.
*
*
* @param rssRoot the root element of the RSS document in case it's needed for context.
* @param eItem the item element to parse.
* @return the parsed RSSItem bean.
*/
protected Item parseItem(final Element rssRoot, final Element eItem, final Locale locale) {
final Item item = new Item();
final Element title = eItem.getChild("title", getRSSNamespace());
if (title != null) {
item.setTitle(title.getText());
}
final Element link = eItem.getChild("link", getRSSNamespace());
if (link != null) {
item.setLink(link.getText());
item.setUri(link.getText());
}
item.setModules(parseItemModules(eItem, locale));
final List
foreignMarkup = extractForeignMarkup(eItem, item, getRSSNamespace());
// content:encoded elements are treated special, without a module, they have to be removed
// from the foreign markup to avoid duplication in case of read/write. Note that this fix
// will break if a content module is used
final Iterator iterator = foreignMarkup.iterator();
while (iterator.hasNext()) {
final Element element = iterator.next();
final Namespace eNamespace = element.getNamespace();
final String eName = element.getName();
if (getContentNamespace().equals(eNamespace) && eName.equals("encoded")) {
iterator.remove();
}
}
if (!foreignMarkup.isEmpty()) {
item.setForeignMarkup(foreignMarkup);
}
return item;
}
/**
* Parses the root element of an RSS document looking for text-input information.
*
* It reads title, description, name and link out of the 'textinput' or 'textInput' element.
*
*
* @param rssRoot the root element of the RSS document to parse for text-input information.
* @return the parsed RSSTextInput bean.
*/
protected TextInput parseTextInput(final Element rssRoot) {
TextInput textInput = null;
final Element eTextInput = getTextInput(rssRoot);
if (eTextInput != null) {
textInput = new TextInput();
final Element title = eTextInput.getChild("title", getRSSNamespace());
if (title != null) {
textInput.setTitle(title.getText());
}
final Element description = eTextInput.getChild("description", getRSSNamespace());
if (description != null) {
textInput.setDescription(description.getText());
}
final Element name = eTextInput.getChild("name", getRSSNamespace());
if (name != null) {
textInput.setName(name.getText());
}
final Element link = eTextInput.getChild("link", getRSSNamespace());
if (link != null) {
textInput.setLink(link.getText());
}
}
return textInput;
}
}