crawlercommons.sitemaps.sax.RSSHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of crawler-commons Show documentation
Show all versions of crawler-commons Show documentation
crawler-commons is a set of reusable Java components that implement
functionality common to any web crawler.
The newest version!
/**
* Copyright 2016 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.sax;
import static crawlercommons.sitemaps.SiteMapParser.LOG;
import static crawlercommons.sitemaps.SiteMapParser.urlIsValid;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.ZonedDateTime;
import java.util.LinkedList;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
/**
 * Parse XML document which is assumed to be in RSS format. RSS 2.0 example:
 *
 * <pre>
 * {@code
 * <?xml version="1.0"?>
 * <rss version="2.0">
 *   <channel>
 *     <title>Lift Off News</title>
 *     <link>http://liftoff.msfc.nasa.gov/</link>
 *     <description>Liftoff to Space Exploration.</description>
 *     <language>en-us</language>
 *     <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
 *     <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate>
 *     <docs>http://blogs.law.harvard.edu/tech/rss</docs>
 *     <generator>Weblog Editor 2.0</generator>
 *     <managingEditor>editor@example.com</managingEditor>
 *     <webMaster>webmaster@example.com</webMaster>
 *     <ttl>5</ttl>
 *     <item>
 *       <title>Star City</title>
 *       <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
 *       <description>How do Americans get ready to work with Russians aboard the
 *         International Space Station? They take a crash course in culture,
 *         language and protocol at Russia's Star City.</description>
 *       <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
 *       <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
 *     </item>
 *     <item>
 *       <title>Space Exploration</title>
 *       <link>http://liftoff.msfc.nasa.gov/</link>
 *       <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada
 *         will experience a partial eclipse of the Sun on Saturday, May 31.</description>
 *       <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
 *       <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
 *     </item>
 *   </channel>
 * </rss>
 * }
 * </pre>
 *
 * Only the elements {@code link}, {@code guid}, {@code pubDate}, {@code item}
 * and {@code rss} are evaluated; all other elements are ignored.
 */
class RSSHandler extends DelegatorHandler {

    /** Sitemap populated while parsing; created in the constructor with type RSS. */
    private SiteMap sitemap;

    /** URL of the item currently being parsed, or null if none was accepted yet. */
    private URL locURL;

    /** Publication date of the item currently being parsed, or null if unknown. */
    private ZonedDateTime lastMod;

    /**
     * Not read or written anywhere in this class. Kept only because it is
     * package-visible and may be referenced from elsewhere in the package.
     */
    boolean valid;

    /**
     * Creates a handler which collects the items of an RSS feed into a
     * {@link SiteMap} of type {@link SitemapType#RSS}.
     *
     * @param url
     *            URL of the feed, used to create the sitemap and (via
     *            {@code sitemap.getUrl()}) as context when resolving item
     *            links
     * @param elementStack
     *            stack of element names, forwarded to {@link DelegatorHandler}
     * @param strict
     *            strict-mode flag, forwarded to {@link DelegatorHandler}; when
     *            {@link #isStrict()} is true, only URLs passing
     *            {@code urlIsValid} are added
     */
    RSSHandler(URL url, LinkedList<String> elementStack, boolean strict) {
        super(elementStack, strict);
        sitemap = new SiteMap(url);
        sitemap.setType(SitemapType.RSS);
    }

    /**
     * Intentionally empty: this handler does all of its work in
     * {@link #characters(char[], int, int)} and
     * {@link #endElement(String, String, String)}.
     *
     * @see crawlercommons.sitemaps.sax.DelegatorHandler#startElement(java.lang.String,
     *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
    }

    /**
     * Evaluates the text collected for the element which has just been
     * closed:
     * <ul>
     * <li>{@code link}: taken as URL of the current item</li>
     * <li>{@code guid}: taken as URL of the current item only if no URL has
     * been accepted yet</li>
     * <li>{@code pubDate}: directly below {@code channel} it sets the last
     * modified date of the sitemap, otherwise it is the date of the current
     * item</li>
     * <li>{@code item}: the collected URL (if any) is added to the
     * sitemap</li>
     * <li>{@code rss}: the sitemap is marked as processed</li>
     * </ul>
     *
     * @see crawlercommons.sitemaps.sax.DelegatorHandler#endElement(java.lang.String,
     *      java.lang.String, java.lang.String)
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if ("link".equals(localName)) {
            setLocURL();
        } else if ("guid".equals(localName)) {
            // accept as link if
            // - a valid absolute URL (not a URN, UUID or similar)
            // - and no link found yet
            if (locURL == null) {
                setLocURL();
            }
            // discard the guid text in case it has not been consumed above
            resetCharacterBuffer();
        } else if ("item".equals(localName)) {
            maybeAddSiteMapUrl();
        } else if ("rss".equals(localName)) {
            sitemap.setProcessed(true);
        } else if ("pubDate".equals(localName)) {
            String value = getAndResetCharacterBuffer();
            ZonedDateTime timestamp = AbstractSiteMap.parseRSSTimestamp(value);
            if ("channel".equals(super.currentElementParent())) {
                // The channel date describes the feed as a whole. It must not
                // be stored in the per-item state, otherwise the first item
                // lacking its own pubDate would silently inherit it.
                if (timestamp != null) {
                    sitemap.setLastModified(timestamp);
                }
            } else {
                lastMod = timestamp;
            }
        }
    }

    /**
     * Buffers character data, but only while inside one of the elements
     * evaluated by {@link #endElement(String, String, String)}
     * ({@code pubDate}, {@code link}, {@code guid}).
     *
     * @see crawlercommons.sitemaps.sax.DelegatorHandler#characters(char[], int,
     *      int)
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        String current = super.currentElement();
        if ("pubDate".equals(current) || "link".equals(current) || "guid".equals(current)) {
            appendCharacterBuffer(ch, start, length);
        }
    }

    /**
     * @return the sitemap filled by this handler
     */
    @Override
    public AbstractSiteMap getSiteMap() {
        return sitemap;
    }

    /**
     * Takes the buffered text as URL of the current item: the text is
     * resolved against the sitemap URL and passed through {@code urlFilter}.
     * {@link #locURL} is modified only if the value is a well-formed URL
     * which is accepted by the filter. A rejected or malformed value leaves a
     * previously accepted URL untouched and never becomes the item URL
     * itself.
     */
    private void setLocURL() {
        String loc = getAndResetCharacterBuffer();
        if (loc == null) {
            return;
        }
        String value = stripAllBlank(loc);
        if (value.isEmpty()) {
            return;
        }
        try {
            // check that the value is a valid URL; keep the result local
            // until the filter has accepted it
            URL resolved = new URL(sitemap.getUrl(), value);
            String urlFiltered = urlFilter.apply(resolved.toString());
            if (urlFiltered == null) {
                LOG.debug("Filtered URL {}", value);
                return;
            }
            locURL = new URL(urlFiltered);
        } catch (MalformedURLException e) {
            LOG.debug("Bad url: [{}]", value);
            LOG.trace("Can't create an entry with a bad URL", e);
        }
    }

    /**
     * Adds the URL collected for the current item to the sitemap, together
     * with the collected publication date. The URL is added if it passes
     * {@code urlIsValid} against the base URL of the sitemap or if the
     * handler is not strict. The per-item state is cleared in any case.
     */
    private void maybeAddSiteMapUrl() {
        if (locURL != null) {
            boolean urlValid = urlIsValid(sitemap.getBaseUrl(), locURL.toString());
            if (urlValid || !isStrict()) {
                SiteMapURL sUrl = new SiteMapURL(locURL, urlValid);
                sUrl.setLastModified(lastMod);
                sitemap.addSiteMapUrl(sUrl);
            }
        }
        locURL = null;
        lastMod = null;
    }

    /**
     * On a recoverable parse error, flushes the item collected so far so that
     * it is not lost.
     */
    @Override
    public void error(SAXParseException e) throws SAXException {
        maybeAddSiteMapUrl();
    }

    /**
     * On a fatal parse error, flushes the item collected so far so that it is
     * not lost.
     */
    @Override
    public void fatalError(SAXParseException e) throws SAXException {
        maybeAddSiteMapUrl();
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy