crawlercommons.sitemaps.SiteMapTester Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of crawler-commons Show documentation
crawler-commons is a set of reusable Java components that implement functionality common to any web crawler.
The newest version!
/**
 * Copyright 2016 Crawler-Commons
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package crawlercommons.sitemaps;

import java.io.IOException;
import java.net.URL;
import java.util.Collection;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import crawlercommons.filters.basic.BasicURLNormalizer;

/**
 * Sitemap Tool for recursively fetching all URL's from a sitemap (and all of
 * it's children)
 **/
public class SiteMapTester {

    private static final Logger LOG = LoggerFactory.getLogger(SiteMapTester.class);
    private static SiteMapParser saxParser = new SiteMapParser(false, true);

    public static void main(String[] args) throws IOException, UnknownFormatException {
        if (args.length < 1) {
            LOG.error("Fetch and process a sitemap (recursively if a sitemap index)");
            LOG.error("Usage: SiteMapTester  [MIME_TYPE]");
            LOG.error("Options:");
            LOG.error("  URL_TO_TEST  URL of sitemap");
            LOG.error("  MIME_TYPE    force processing sitemap as MIME type,");
            LOG.error("               bypass automatic MIME type detection");
            LOG.error("Java properties:");
            LOG.error("  sitemap.strictNamespace");
            LOG.error("                  if true sitemaps are required to use the standard namespace URI");
            LOG.error("  sitemap.allow.dtd");
            LOG.error("                  if true sitemaps are allowed to include a DTD");
            LOG.error("  sitemap.extensions");
            LOG.error("                  if true enable sitemap extension parsing");
            LOG.error("  sitemap.filter.urls");
            LOG.error("                  if true filter and normalize all URLs found in the sitemap");
            LOG.error("                  using crawlercommons.filters.basic.BasicURLNormalizer");
        } else {
            URL url = new URL(args[0]);
            String mt = (args.length > 1) ? args[1] : null;

            parse(url, mt);
        }
    }

    /**
     * Parses a Sitemap recursively meaning that if the sitemap is a
     * sitemapIndex then it parses all of the internal sitemaps
     */
    private static void parse(URL url, String mt) throws IOException, UnknownFormatException {
        byte[] content = IOUtils.toByteArray(url);

        LOG.info("Parsing {} {}", url, ((mt != null && !mt.isEmpty()) ? "as MIME type " + mt : ""));

        boolean strictNamespace = Boolean.getBoolean("sitemap.strictNamespace");
        saxParser.setStrictNamespace(strictNamespace);

        boolean allowDTD = Boolean.getBoolean("sitemap.allow.dtd");
        saxParser.setAllowDocTypeDefinitions(allowDTD);

        boolean enableExtensions = Boolean.getBoolean("sitemap.extensions");
        if (enableExtensions) {
            saxParser.enableExtensions();
        }

        boolean enableURLFilter = Boolean.getBoolean("sitemap.filter.urls");
        if (enableURLFilter) {
            saxParser.setURLFilter(new BasicURLNormalizer());
        }

        AbstractSiteMap sm = null;
        // guesses the mimetype
        if (mt == null || mt.equals("")) {
            sm = saxParser.parseSiteMap(content, url);
        } else {
            sm = saxParser.parseSiteMap(mt, content, url);
        }

        if (sm.isIndex()) {
            Collection links = ((SiteMapIndex) sm).getSitemaps();
            for (AbstractSiteMap asm : links) {
                parse(asm.getUrl(), mt); // Recursive call
            }
        } else {
            Collection links = ((SiteMap) sm).getSiteMapUrls();
            for (SiteMapURL smu : links) {
                if (enableExtensions) {
                    LOG.info(smu.toString());
                } else {
                    LOG.info(smu.getUrl().toString());
                }
            }
        }
    }
}