All downloads are free. The search and download functionality uses the official Maven repository.

org.archive.crawler.spring.SheetOverlaysManager Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
 
package org.archive.crawler.spring;

import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.spring.OverlayMapsSource;
import org.archive.spring.Sheet;
import org.archive.util.PrefixFinder;
import org.archive.util.SurtPrefixSet;
import org.springframework.beans.BeansException;
import org.springframework.beans.TypeMismatchException;
import org.springframework.beans.factory.BeanFactory;
import org.springframework.beans.factory.BeanFactoryAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.context.event.ContextRefreshedEvent;

/**
 * Manager which marks-up CrawlURIs with the names of all applicable 
 * Sheets, and returns overlay maps by name. 
 * 
 * @author gojomo
 */
public class SheetOverlaysManager implements 
BeanFactoryAware, OverlayMapsSource, ApplicationListener {
    private static final Logger logger = Logger.getLogger(SheetOverlaysManager.class.getName());
    

    protected BeanFactory beanFactory; 
    /** all SheetAssociations by DecideRule evaluation */ 
    protected SortedSet ruleAssociations = 
        new ConcurrentSkipListSet();
    protected NavigableMap> sheetNamesBySurt = new ConcurrentSkipListMap>(); 
    
    /** all sheets by (bean)name*/
    protected Map sheetsByName = new ConcurrentHashMap();
    
    public void setBeanFactory(BeanFactory beanFactory) throws BeansException {
        this.beanFactory = beanFactory;
    }
    
    /**
     * Collect all Sheets, by beanName. 
     * @param map
     */
    @Autowired(required=false)
    public void setSheetsByName(Map map) {
        this.sheetsByName = map;
    }
    /**
     * Sheets, by name; starts with all autowired Sheets but others
     * may be added by other means (mid-crawl reconfiguration). 
     * @return map of Sheets by their String name
     */
    public Map getSheetsByName() {
        return this.sheetsByName;
    }

    /**
     * All DecideRuledSheetAssociations, in Ordered order
     *  
     * @return set of DecideRuledSheetAssociation
     */
    public SortedSet getRuleAssociations() {
        return this.ruleAssociations;
    }
    
    /**
     * Sheet names, by the SURT prefix to which they should be applied.
     * 
     * @return map of Sheet names by their configured SURT
     */
    public NavigableMap> getSheetsNamesBySurt() {
        return this.sheetNamesBySurt;
    }
    /**
     * Collect all rule-based SheetAssociations. Typically autowired 
     * from the set of all DecideRuledSheetAssociation instances. 
     */
    @Autowired(required=false)
    public void addRuleAssociations(Set associations) {
        // always keep sorted by order
        this.ruleAssociations.clear();
        this.ruleAssociations.addAll(associations);
    }
    
    public void addRuleAssociation(DecideRuledSheetAssociation assoc) {
        this.ruleAssociations.add(assoc); 
    }

    /**
     * Collect all SURT-based SheetAssociations. Typically autowired 
     * from the set of all SurtPrefixesSheetAssociation instances
     * declared in the initial configuration. 
     */
    @Autowired(required=false)
    public void addSurtAssociations(List associations) {
        for(SurtPrefixesSheetAssociation association : associations) {
            addSurtsAssociation(association);
        }
    }
    
    public void addSurtAssociation(String prefix, String sheetName) {
        List sheetNames = sheetNamesBySurt.get(prefix);
        if(sheetNames == null) {
            sheetNames = new LinkedList();
        }
        sheetNames.add(sheetName); 
        sheetNamesBySurt.put(prefix, sheetNames); 
    }
    
    public boolean removeSurtAssociation(String prefix, String sheetName) {
        List sheetNames = sheetNamesBySurt.get(prefix);
        if(sheetNames == null) {
            // no such association
            return false; 
        }
        return sheetNames.remove(sheetName); 
    }

    /** 
     * Add an individual surtsAssociation to the sheetNamesBySurt map.
     */
    public void addSurtsAssociation(SurtPrefixesSheetAssociation assoc) {
        for(String prefix : assoc.getSurtPrefixes()) {
            for(String s : assoc.getTargetSheetNames()) {
                addSurtAssociation(prefix, s);
            }
        }
    }
    

    /**
     * Retrieve the named overlay Map.
     * 
     * @see org.archive.spring.OverlayMapsSource#getOverlayMap(java.lang.String)
     */
    public Map getOverlayMap(String name) {
        Sheet sheet = sheetsByName.get(name);
        if (sheet != null) {
            return sheet.getMap();
        } else {
            return null;
        }
    }

    /** 
     * Ensure all sheets are 'primed' after the entire ApplicatiotnContext
     * is assembled. This ensures target HasKeyedProperties beans know
     * any long paths by which their properties are addressed, and 
     * handles (by either PropertyEditor-conversion or a fast-failure)
     * any type-mismatches between overlay values and their target
     * properties.
     * @see org.springframework.context.ApplicationListener#onApplicationEvent(org.springframework.context.ApplicationEvent)
     */
    @Override
    public void onApplicationEvent(ApplicationEvent event) {
        if(event instanceof ContextRefreshedEvent) {
            for(Sheet s: sheetsByName.values()) {
                s.prime(); // exception if Sheet can't target overridable properties
            }
            // log warning for any sheets named but not present
            HashSet allSheetNames = new HashSet();
            for(DecideRuledSheetAssociation assoc : ruleAssociations) {
                allSheetNames.addAll(assoc.getTargetSheetNames());
            }
            for(List names : sheetNamesBySurt.values()) {
                allSheetNames.addAll(names);
            }
            for(String name : allSheetNames) {
                if(!sheetsByName.containsKey(name)) {
                    logger.warning("sheet '"+name+"' referenced but absent");
                }
            }
        }
    }
    
    //
    // Convenience methods for during-crawl overlay updates
    //
    
    /**
     * Add to named sheet an overlay of the given bean-path and new value. 
     * Creates the sheet if it does not already exist; re-primes the sheet
     * after the change to inform any targeted beans of new external paths. 
     * 
     * Only if/when the sheet is applied via associations will the overlay 
     * have a noticeably effect. Inserting/mutating/priming sheets should
     * only be done in a paused crawl. 
     * 
     * @param sheetName sheet name to change (or create)
     * @param beanPath target bean-path of overlay
     * @param value new value
     * @return old value, if any
     */
    public Object putSheetOverlay(String sheetName, String beanPath, Object value) {
        Sheet sheet = getOrCreateSheet(sheetName); 
        Object prevVal = sheet.getMap().put(beanPath, value);
        try {
            sheet.prime(); 
        } catch (TypeMismatchException tme) {
            // revert to presumably non-damaging value
            sheet.getMap().put(beanPath, prevVal);
            throw tme;
        }
        return prevVal; 
    }
    
    /**
     * Remove the given bean-path overlay in the named sheet. 
     * 
     * @param sheetName sheet name from which to remove overlay
     * @param beanPath overlay to remove
     * @return previous overlay value, if any
     */
    public Object removeSheetOverlay(String sheetName, String beanPath) {
        Sheet sheet = sheetsByName.get(sheetName); 
        if(sheet==null) {
            return null; 
        }
        // TODO: do all the externalPaths created by priming need eventual cleanup?
        return sheet.getMap().remove(beanPath);
    }
    
    /**
     * Delete a named sheet from all associations and the master named 
     * sheets map. 
     * @param sheetName sheet name to delete
     * @return true if any associations/sheet actually deleted
     */
    public boolean deleteSheet(String sheetName) {
        boolean anyDeleted = false; 
        // remove as target of any ruled-associations
        for(DecideRuledSheetAssociation assoc : ruleAssociations) {
            anyDeleted |= assoc.getTargetSheetNames().remove(sheetName);
        }
        // remove as target of any surt-associations
        for(List sheetNames : sheetNamesBySurt.values()) {
            anyDeleted |= sheetNames.remove(sheetName);            
        }
        anyDeleted |= (null != sheetsByName.remove(sheetName)); 
        return anyDeleted;
    }
    
    /**
     * Get a Sheet of the given name, or create if it does not already 
     * exist. Provided for convenience of creating Sheet instances after 
     * the container has been built. 
     * 
     * To have effect as an overlay, the returned Sheet must be:
     * 
     * (1) filled with overlay entries, where the key is a full bean-path 
     * and the value the alternate overlay value; 
     * (2) primed via the prime() method, which will throw an exception
     * if the target bean-path does not address a compatible overlayable
     * value;
     * (3) associated to some URIs, by the addSurtAssociation or 
     * addRuledAssociation methods
     * 
     * @param name Sheet name to create; must be unique
     * @return created Sheet
     */
    public Sheet getOrCreateSheet(String name) {
        Sheet sheet = sheetsByName.get(name); 
        if(sheet==null) {
            sheet = new Sheet(); 
            sheet.setBeanFactory(beanFactory);
            sheet.setName(name); 
            sheet.setMap(new HashMap());
            sheetsByName.put(name, sheet);
        }
        return sheet;
    }
    
    /**
     * Apply the proper overlays (by Sheet beanName) to the given CrawlURI,
     * according to configured associations.  
     * 
     * TODO: add guard against redundant application more than once? 
     * TODO: add mechanism for reapplying overlays after settings change? 
     * @param curi
     */
    public void applyOverlaysTo(CrawlURI curi) {
        curi.setOverlayMapsSource(this); 
        // apply SURT-based overlays
        curi.getOverlayNames().clear(); // clear previous info
        String effectiveSurt = SurtPrefixSet.getCandidateSurt(curi.getPolicyBasisUURI());
        List foundPrefixes = PrefixFinder.findKeys(sheetNamesBySurt, effectiveSurt);       
        for(String prefix : foundPrefixes) {
            for(String name : sheetNamesBySurt.get(prefix)) {
                curi.getOverlayNames().add(name);
            }
        }
        // apply deciderule-based overlays
        for(DecideRuledSheetAssociation assoc : ruleAssociations) {
            try {
                if(assoc.getRules().accepts(curi)) {
                    curi.getOverlayNames().addAll(assoc.getTargetSheetNames());
                }
            } catch (Exception e) {
                logger.log(Level.SEVERE, "problem determining whether to apply overlays, so not applying " + assoc.getTargetSheetNames() + " to " + curi, e);
            }
        }
        // even if no overlays set, let creation of empty list signal
        // step has occurred -- helps ensure overlays added once-only
        curi.getOverlayNames();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy