All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.modules.deciderules.TransclusionDecideRule Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.5.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.deciderules;

import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.Hop;

/**
 * Rule ACCEPTs any CrawlURIs whose path-from-seed ('hopsPath' -- see
 * {@link CrawlURI#getPathFromSeed()} ends 
 * with at least one, but not more than, the given number of 
 * non-navlink ('L') hops. 
 * 
 * Otherwise, if the path-from-seed is empty or if a navlink ('L') occurs
 * within max-trans-hops of the tail of the path-from-seed, this rule
 * returns PASS.
 *  
 * 

Thus, it allows things like embedded resources (frames/images/media) * and redirects to be transitively included ('transcluded') in a crawl, * even if they otherwise would not, for some reasonable number of hops * (usually 1-5). * * @see Transclusion * * @author gojomo */ public class TransclusionDecideRule extends PredicatedDecideRule { private static final long serialVersionUID = -3975688876990558918L; /** * Maximum number of non-navlink (non-'L') hops to ACCEPT. */ { setMaxTransHops(2); } public int getMaxTransHops() { return (Integer) kp.get("maxTransHops"); } public void setMaxTransHops(int maxTransHops) { kp.put("maxTransHops", maxTransHops); } /** * Maximum number of speculative ('X') hops to ACCEPT. */ { setMaxSpeculativeHops(1); } public int getMaxSpeculativeHops() { return (Integer) kp.get("maxSpeculativeHops"); } public void setMaxSpeculativeHops(int maxSpeculativeHops) { kp.put("maxSpeculativeHops", maxSpeculativeHops); } /** * Usual constructor. */ public TransclusionDecideRule() { } /** * Evaluate whether given object is within the acceptable thresholds of * transitive hops. * * @param curi CrawlURI to make decision on. * @return true if the transitive hops >0 and <= max */ protected boolean evaluate(CrawlURI curi) { String hopsPath = curi.getPathFromSeed(); if (hopsPath == null || hopsPath.length() == 0) { return false; } int allCount = 0; int nonrefCount = 0; int specCount = 0; for (int i = hopsPath.length() - 1; i >= 0; i--) { char c = hopsPath.charAt(i); if (c == Hop.NAVLINK.getHopChar() || c == Hop.SUBMIT.getHopChar()) { // end of hops counted here break; } allCount++; if (c != Hop.REFER.getHopChar()) { nonrefCount++; } if (c == Hop.SPECULATIVE.getHopChar()) { specCount++; } } // transclusion doesn't apply if there isn't at least one non-nav-hop if (allCount <= 0) { return false; } // too many speculative hops disqualify from transclusion if (specCount > getMaxSpeculativeHops()) { return false; } // transclusion applies as long as non-ref hops less than max return nonrefCount <= getMaxTransHops(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy