All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.modules.deciderules.HopCrossesAssignmentLevelDomainDecideRule Maven / Gradle / Ivy

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.deciderules;

import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.archive.modules.CrawlURI;
import org.archive.net.PublicSuffixes;
import org.archive.net.UURI;

/**
 * Applies its decision if the current URI differs from its 'via' URI
 * in that portion of its hostname/domain that is assigned/sold by
 * registrars, its 'assignment-level-domain' (ALD) (AKA 'public suffix',
 * or in previous Heritrix versions, 'topmost assigned SURT').
 * 
 * @author Olaf Freyer
 */
public class HopCrossesAssignmentLevelDomainDecideRule extends PredicatedDecideRule {
    private static final long serialVersionUID = 1L;

    private static final Logger LOGGER = Logger
            .getLogger(HopCrossesAssignmentLevelDomainDecideRule.class.getName());

    /**
     * Matches a SURT-form string and captures the parenthesised segment
     * that follows "://". Compiled once because
     * {@link #getAssignmentLevelSurt(UURI)} runs twice per evaluated URI
     * and {@code String.replaceFirst} would recompile the expression on
     * every call.
     */
    private static final Pattern SURT_HOST_PATTERN =
            Pattern.compile(".*://\\((.*?)\\).*");

    public HopCrossesAssignmentLevelDomainDecideRule() {
    }

    /**
     * Tests whether the hop from the URI's 'via' to the URI itself changes
     * the assignment-level domain (ALD).
     *
     * @param uri the URI under evaluation
     * @return {@code true} when the URI's ALD is non-null and is not equal
     *         to the ALD of its via (a null via ALD therefore counts as
     *         different); {@code false} when there is no via, when the
     *         URI's own ALD is null, when both ALDs are equal, or when any
     *         exception is raised while computing them
     */
    protected boolean evaluate(CrawlURI uri) {
        UURI via = uri.getVia();
        if (via == null) {
            // nothing to compare against, so no border can be crossed
            return false;
        }
        try {
            // determine if this hop crosses assignment-level-domain borders
            String ald = getAssignmentLevelSurt(uri.getUURI());
            String viaAld = getAssignmentLevelSurt(via);
            if (ald != null && !ald.equals(viaAld)) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("rule matched for \"" + ald+"\" vs. \""+viaAld+"\"");
                }
                return true;
            }
        } catch (Exception e) {
            // Deliberately best-effort: log and fall through to "no match"
            // since we could not get hostname or something else went wrong
            LOGGER.log(Level.WARNING,"uri="+uri+" via="+via, e);
        }
        return false;
    }

    /**
     * Derives the assignment-level SURT for the given URI.
     *
     * <p>The text between the parentheses following "://" in the URI's SURT
     * form is extracted; if the SURT form does not have that shape the
     * whole SURT string is used unchanged. The result is then handed to
     * {@link PublicSuffixes#reduceSurtToAssignmentLevel(String)}.
     *
     * @param uuri the URI whose SURT form is inspected
     * @return whatever {@code PublicSuffixes.reduceSurtToAssignmentLevel}
     *         returns for the extracted string
     */
    private String getAssignmentLevelSurt(UURI uuri) {
        // Matcher.replaceFirst is what String.replaceFirst delegates to, so
        // the result is identical to the previous per-call compilation.
        String hostSurt = SURT_HOST_PATTERN.matcher(uuri.getSurtForm()).replaceFirst("$1");
        return PublicSuffixes.reduceSurtToAssignmentLevel(hostSurt);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy