org.archive.crawler.spring.SheetOverlaysManager Maven / Gradle / Ivy
The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.spring;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.modules.CrawlURI;
import org.archive.spring.OverlayMapsSource;
import org.archive.spring.Sheet;
import org.archive.util.PrefixFinder;
import org.archive.util.SurtPrefixSet;
import org.springframework.beans.BeansException;
import org.springframework.beans.TypeMismatchException;
import org.springframework.beans.factory.BeanFactory;
import org.springframework.beans.factory.BeanFactoryAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.context.event.ContextRefreshedEvent;
/**
* Manager which marks-up CrawlURIs with the names of all applicable
* Sheets, and returns overlay maps by name.
*
* @author gojomo
*/
public class SheetOverlaysManager implements
BeanFactoryAware, OverlayMapsSource, ApplicationListener {
private static final Logger logger = Logger.getLogger(SheetOverlaysManager.class.getName());
protected BeanFactory beanFactory;
/** all SheetAssociations by DecideRule evaluation */
protected SortedSet ruleAssociations =
new ConcurrentSkipListSet();
protected NavigableMap> sheetNamesBySurt = new ConcurrentSkipListMap>();
/** all sheets by (bean)name*/
protected Map sheetsByName = new ConcurrentHashMap();
public void setBeanFactory(BeanFactory beanFactory) throws BeansException {
this.beanFactory = beanFactory;
}
/**
* Collect all Sheets, by beanName.
* @param map
*/
@Autowired(required=false)
public void setSheetsByName(Map map) {
this.sheetsByName = map;
}
/**
* Sheets, by name; starts with all autowired Sheets but others
* may be added by other means (mid-crawl reconfiguration).
* @return map of Sheets by their String name
*/
public Map getSheetsByName() {
return this.sheetsByName;
}
/**
* All DecideRuledSheetAssociations, in Ordered order
*
* @return set of DecideRuledSheetAssociation
*/
public SortedSet getRuleAssociations() {
return this.ruleAssociations;
}
/**
* Sheet names, by the SURT prefix to which they should be applied.
*
* @return map of Sheet names by their configured SURT
*/
public NavigableMap> getSheetsNamesBySurt() {
return this.sheetNamesBySurt;
}
/**
* Collect all rule-based SheetAssociations. Typically autowired
* from the set of all DecideRuledSheetAssociation instances.
*/
@Autowired(required=false)
public void addRuleAssociations(Set associations) {
// always keep sorted by order
this.ruleAssociations.clear();
this.ruleAssociations.addAll(associations);
}
public void addRuleAssociation(DecideRuledSheetAssociation assoc) {
this.ruleAssociations.add(assoc);
}
/**
* Collect all SURT-based SheetAssociations. Typically autowired
* from the set of all SurtPrefixesSheetAssociation instances
* declared in the initial configuration.
*/
@Autowired(required=false)
public void addSurtAssociations(List associations) {
for(SurtPrefixesSheetAssociation association : associations) {
addSurtsAssociation(association);
}
}
public void addSurtAssociation(String prefix, String sheetName) {
List sheetNames = sheetNamesBySurt.get(prefix);
if(sheetNames == null) {
sheetNames = new LinkedList();
}
sheetNames.add(sheetName);
sheetNamesBySurt.put(prefix, sheetNames);
}
public boolean removeSurtAssociation(String prefix, String sheetName) {
List sheetNames = sheetNamesBySurt.get(prefix);
if(sheetNames == null) {
// no such association
return false;
}
return sheetNames.remove(sheetName);
}
/**
* Add an individual surtsAssociation to the sheetNamesBySurt map.
*/
public void addSurtsAssociation(SurtPrefixesSheetAssociation assoc) {
for(String prefix : assoc.getSurtPrefixes()) {
for(String s : assoc.getTargetSheetNames()) {
addSurtAssociation(prefix, s);
}
}
}
/**
* Retrieve the named overlay Map.
*
* @see org.archive.spring.OverlayMapsSource#getOverlayMap(java.lang.String)
*/
public Map getOverlayMap(String name) {
Sheet sheet = sheetsByName.get(name);
if (sheet != null) {
return sheet.getMap();
} else {
return null;
}
}
/**
* Ensure all sheets are 'primed' after the entire ApplicatiotnContext
* is assembled. This ensures target HasKeyedProperties beans know
* any long paths by which their properties are addressed, and
* handles (by either PropertyEditor-conversion or a fast-failure)
* any type-mismatches between overlay values and their target
* properties.
* @see org.springframework.context.ApplicationListener#onApplicationEvent(org.springframework.context.ApplicationEvent)
*/
@Override
public void onApplicationEvent(ApplicationEvent event) {
if(event instanceof ContextRefreshedEvent) {
for(Sheet s: sheetsByName.values()) {
s.prime(); // exception if Sheet can't target overridable properties
}
// log warning for any sheets named but not present
HashSet allSheetNames = new HashSet();
for(DecideRuledSheetAssociation assoc : ruleAssociations) {
allSheetNames.addAll(assoc.getTargetSheetNames());
}
for(List names : sheetNamesBySurt.values()) {
allSheetNames.addAll(names);
}
for(String name : allSheetNames) {
if(!sheetsByName.containsKey(name)) {
logger.warning("sheet '"+name+"' referenced but absent");
}
}
}
}
//
// Convenience methods for during-crawl overlay updates
//
/**
* Add to named sheet an overlay of the given bean-path and new value.
* Creates the sheet if it does not already exist; re-primes the sheet
* after the change to inform any targeted beans of new external paths.
*
* Only if/when the sheet is applied via associations will the overlay
* have a noticeably effect. Inserting/mutating/priming sheets should
* only be done in a paused crawl.
*
* @param sheetName sheet name to change (or create)
* @param beanPath target bean-path of overlay
* @param value new value
* @return old value, if any
*/
public Object putSheetOverlay(String sheetName, String beanPath, Object value) {
Sheet sheet = getOrCreateSheet(sheetName);
Object prevVal = sheet.getMap().put(beanPath, value);
try {
sheet.prime();
} catch (TypeMismatchException tme) {
// revert to presumably non-damaging value
sheet.getMap().put(beanPath, prevVal);
throw tme;
}
return prevVal;
}
/**
* Remove the given bean-path overlay in the named sheet.
*
* @param sheetName sheet name from which to remove overlay
* @param beanPath overlay to remove
* @return previous overlay value, if any
*/
public Object removeSheetOverlay(String sheetName, String beanPath) {
Sheet sheet = sheetsByName.get(sheetName);
if(sheet==null) {
return null;
}
// TODO: do all the externalPaths created by priming need eventual cleanup?
return sheet.getMap().remove(beanPath);
}
/**
* Delete a named sheet from all associations and the master named
* sheets map.
* @param sheetName sheet name to delete
* @return true if any associations/sheet actually deleted
*/
public boolean deleteSheet(String sheetName) {
boolean anyDeleted = false;
// remove as target of any ruled-associations
for(DecideRuledSheetAssociation assoc : ruleAssociations) {
anyDeleted |= assoc.getTargetSheetNames().remove(sheetName);
}
// remove as target of any surt-associations
for(List sheetNames : sheetNamesBySurt.values()) {
anyDeleted |= sheetNames.remove(sheetName);
}
anyDeleted |= (null != sheetsByName.remove(sheetName));
return anyDeleted;
}
/**
* Get a Sheet of the given name, or create if it does not already
* exist. Provided for convenience of creating Sheet instances after
* the container has been built.
*
* To have effect as an overlay, the returned Sheet must be:
*
* (1) filled with overlay entries, where the key is a full bean-path
* and the value the alternate overlay value;
* (2) primed via the prime() method, which will throw an exception
* if the target bean-path does not address a compatible overlayable
* value;
* (3) associated to some URIs, by the addSurtAssociation or
* addRuledAssociation methods
*
* @param name Sheet name to create; must be unique
* @return created Sheet
*/
public Sheet getOrCreateSheet(String name) {
Sheet sheet = sheetsByName.get(name);
if(sheet==null) {
sheet = new Sheet();
sheet.setBeanFactory(beanFactory);
sheet.setName(name);
sheet.setMap(new HashMap());
sheetsByName.put(name, sheet);
}
return sheet;
}
/**
* Apply the proper overlays (by Sheet beanName) to the given CrawlURI,
* according to configured associations.
*
* TODO: add guard against redundant application more than once?
* TODO: add mechanism for reapplying overlays after settings change?
* @param curi
*/
public void applyOverlaysTo(CrawlURI curi) {
curi.setOverlayMapsSource(this);
// apply SURT-based overlays
curi.getOverlayNames().clear(); // clear previous info
String effectiveSurt = SurtPrefixSet.getCandidateSurt(curi.getPolicyBasisUURI());
List foundPrefixes = PrefixFinder.findKeys(sheetNamesBySurt, effectiveSurt);
for(String prefix : foundPrefixes) {
for(String name : sheetNamesBySurt.get(prefix)) {
curi.getOverlayNames().add(name);
}
}
// apply deciderule-based overlays
for(DecideRuledSheetAssociation assoc : ruleAssociations) {
try {
if(assoc.getRules().accepts(curi)) {
curi.getOverlayNames().addAll(assoc.getTargetSheetNames());
}
} catch (Exception e) {
logger.log(Level.SEVERE, "problem determining whether to apply overlays, so not applying " + assoc.getTargetSheetNames() + " to " + curi, e);
}
}
// even if no overlays set, let creation of empty list signal
// step has occurred -- helps ensure overlays added once-only
curi.getOverlayNames();
}
}