All downloads are FREE. The search and download functionality uses the official Maven repository.

crawlercommons.sitemaps.sax.XMLHandler Maven / Gradle / Ivy

Go to download

crawler-commons is a set of reusable Java components that implement functionality common to any web crawler.

The newest version!
/**
 * Copyright 2016 Crawler-Commons
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package crawlercommons.sitemaps.sax;

import static crawlercommons.sitemaps.SiteMapParser.LOG;
import static crawlercommons.sitemaps.SiteMapParser.urlIsValid;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.sax.extension.ExtensionHandler;

/**
 * Parse XML that contains a valid Sitemap. Example of a Sitemap:
 * 
 * 
 * {@code
 * 
 *   
 *     
 *       http://www.example.com/
 *       lastmod>2005-01-01
 *       monthly
 *       0.8
 *     
 *     
 *       http://www.example.com/catalog?item=12&desc=vacation_hawaii
 *       weekly
 *     
 *   
 * }
 * 
*/ class XMLHandler extends DelegatorHandler { private SiteMap sitemap; private String loc; private String lastMod; private String changeFreq; private String priority; private int i = 0; private boolean currentElementNamespaceIsValid; private String currentElementNamespace; protected Map extensionHandlers; XMLHandler(URL url, LinkedList elementStack, boolean strict) { super(elementStack, strict); sitemap = new SiteMap(url); sitemap.setType(SitemapType.XML); } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { currentElementNamespace = uri; if (isExtensionNamespace(uri)) { ExtensionHandler eh = getExtensionHandler(uri); eh.startElement(uri, localName, qName, attributes); return; } else if (isStrictNamespace() && !isAcceptedNamespace(uri)) { LOG.debug("Skip element <{}>, namespace <{}> not accepted", localName, uri); currentElementNamespaceIsValid = false; return; } currentElementNamespaceIsValid = true; // flush any unclosed or missing URL element if ("loc".equals(localName) || "url".equals(localName)) { if (loc == null) { loc = getAndResetCharacterBuffer(); } if (loc != null && !isAllBlank(loc)) { maybeAddSiteMapUrl(); return; } loc = null; if ("url".equals(localName)) { // reset also attributes lastMod = null; changeFreq = null; priority = null; } } resetCharacterBuffer(); } @Override public void endElement(String uri, String localName, String qName) throws SAXException { if (isExtensionNamespace(uri)) { ExtensionHandler eh = getExtensionHandler(uri); eh.endElement(uri, localName, qName); return; } else if (isStrictNamespace() && !isAcceptedNamespace(uri)) { return; } if ("url".equals(localName)) { if ("urlset".equals(currentElementParent())) { maybeAddSiteMapUrl(); } } else if ("urlset".equals(localName)) { sitemap.setProcessed(true); } else if ("loc".equals(localName)) { loc = getAndResetCharacterBuffer(); } else if ("changefreq".equals(localName)) { changeFreq = 
getAndResetCharacterBuffer(); } else if ("lastmod".equals(localName)) { lastMod = getAndResetCharacterBuffer(); } else if ("priority".equals(localName)) { priority = getAndResetCharacterBuffer(); } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (isExtensionNamespace(currentElementNamespace)) { ExtensionHandler eh = getExtensionHandler(currentElementNamespace); eh.characters(ch, start, length); return; } else if (isStrictNamespace() && !currentElementNamespaceIsValid) { return; } String localName = currentElement(); if ("loc".equals(localName) || "url".equals(localName) || "changefreq".equals(localName) || "lastmod".equals(localName) || "priority".equals(localName)) { appendCharacterBuffer(ch, start, length); } } @Override public AbstractSiteMap getSiteMap() { return sitemap; } private void maybeAddSiteMapUrl() { String value = null; if (loc != null) { value = stripAllBlank(loc); } else if ("loc".equals(currentElement())) { value = getAndResetCharacterBuffer(); } if (value == null || isAllBlank(value)) { return; } String urlFiltered = urlFilter.apply(value); if (urlFiltered == null) { LOG.debug("Filtered URL {}", value); return; } try { // check that the value is a valid URL URL locURL = new URL(urlFiltered); boolean valid = urlIsValid(sitemap.getBaseUrl(), locURL.toString()); if (valid || !isStrict()) { SiteMapURL sUrl = new SiteMapURL(locURL, valid); sUrl.setLastModified(lastMod); sUrl.setChangeFrequency(changeFreq); sUrl.setPriority(priority); sitemap.addSiteMapUrl(sUrl); LOG.debug(" {}. 
{}", (++i), sUrl); if (extensionHandlers != null) { for (Entry e : extensionHandlers.entrySet()) { sUrl.addAttributesForExtension(e.getKey(), e.getValue().getAttributes()); } } } } catch (MalformedURLException e) { LOG.debug("Bad url: [{}]", value); LOG.trace("Can't create an entry with a bad URL", e); } finally { loc = null; lastMod = null; changeFreq = null; priority = null; resetExtensionHandlers(); } } /** * Registers and returns an ExtensionHandler instance bound to this handler * * @param uri * URI of sitemap extension namespace * @return handler for the sitemap extension defined by XML namespace */ protected ExtensionHandler getExtensionHandler(String uri) { if (extensionNamespaces.containsKey(uri)) { Extension ext = extensionNamespaces.get(uri); if (extensionHandlers == null) { extensionHandlers = new TreeMap<>(); } if (!extensionHandlers.containsKey(ext)) { extensionHandlers.put(ext, ExtensionHandler.create(ext)); } return extensionHandlers.get(ext); } return null; } protected Collection getExtensionHandlers() { if (extensionHandlers == null) { return new ArrayList(); } return extensionHandlers.values(); } /** * Reset all extension handlers. Attributes of sitemap extensions are bound * to a single {@link SiteMapURL}, handlers should be reset if a sitemap URL * is closed. */ public void resetExtensionHandlers() { if (extensionHandlers != null) { for (Entry e : extensionHandlers.entrySet()) { e.getValue().reset(); } } } @Override public void error(SAXParseException e) throws SAXException { maybeAddSiteMapUrl(); } @Override public void fatalError(SAXParseException e) throws SAXException { maybeAddSiteMapUrl(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy