// org.archive.crawler.framework.CheckpointService (Maven / Gradle / Ivy artifact listing)
// The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.framework;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.comparator.LastModifiedFileComparator;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.crawler.reporting.CrawlStatSnapshot;
import org.archive.spring.ConfigPath;
import org.archive.spring.ConfigPathConfigurer;
import org.archive.spring.HasValidator;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.context.Lifecycle;
import org.springframework.context.support.AbstractApplicationContext;
import org.springframework.validation.Validator;
/**
* Executes checkpoints, and offers convenience methods for enumerating
* available Checkpoints and injecting a recovery-Checkpoint after
* build and before launch (setRecoveryCheckpointByName).
*
* Offers optional automatic checkpointing at a configurable interval
* in minutes.
*
* @author stack
* @author gojomo
* @author pjack
*/
public class CheckpointService implements Lifecycle, ApplicationContextAware, HasValidator {
/** Logger for this class, named after the class itself. */
private static final Logger LOGGER =
        Logger.getLogger(CheckpointService.class.getName());

/** Next overall series checkpoint number. */
protected int nextCheckpointNumber = 1;

/** The checkpoint currently being written; null when no checkpoint is running. */
protected Checkpoint checkpointInProgress;

/** The most recently finished checkpoint, or the recovery checkpoint adopted by start(). */
protected Checkpoint lastCheckpoint;

/** Crawl statistics snapshot recorded when the last successful checkpoint completed. */
protected CrawlStatSnapshot lastCheckpointSnapshot;

/** Service for auto-checkpoint tasks at an interval (created as a daemon timer). */
protected Timer timer = new Timer(true);

/** The auto-checkpoint task most recently installed on the timer, if any. */
protected TimerTask checkpointTask;
/** Checkpoints directory. */
protected ConfigPath checkpointsDir =
        new ConfigPath("checkpoints subdirectory", "checkpoints");

/**
 * @return the currently configured checkpoints directory
 */
public ConfigPath getCheckpointsDir() {
    return this.checkpointsDir;
}

/**
 * Replaces the checkpoints directory.
 *
 * @param checkpointsDir the new checkpoints directory
 */
public void setCheckpointsDir(ConfigPath checkpointsDir) {
    this.checkpointsDir = checkpointsDir;
}
/** Auto-checkpoint period in minutes; the default of -1 disables auto checkpointing. */
protected long checkpointIntervalMinutes = -1;

/**
 * @return the auto-checkpoint period in minutes
 */
public long getCheckpointIntervalMinutes() {
    return this.checkpointIntervalMinutes;
}

/**
 * Period at which to create automatic checkpoints; -1 means
 * no auto checkpointing. When the value actually changes, the
 * auto-checkpoint task is set up again for the new interval.
 *
 * @param interval new period in minutes
 */
public void setCheckpointIntervalMinutes(long interval) {
    final boolean changed = interval != this.checkpointIntervalMinutes;
    this.checkpointIntervalMinutes = interval;
    if (changed) {
        setupCheckpointTask();
    }
}
/** Whether only the latest checkpoint is kept on disk; false by default. */
protected boolean forgetAllButLatest = false;

/**
 * @return true if only the latest checkpoint is kept
 */
public boolean getForgetAllButLatest() {
    return this.forgetAllButLatest;
}

/**
 * True to save only the latest checkpoint, false to save all of them.
 * Default is false.
 *
 * NOTE(review): a change of value also re-runs setupCheckpointTask(), which
 * cancels and reschedules the auto-checkpoint task -- confirm this is intended.
 *
 * @param forgetAllButLatest the new setting
 */
public void setForgetAllButLatest(boolean forgetAllButLatest) {
    final boolean changed = forgetAllButLatest != this.forgetAllButLatest;
    this.forgetAllButLatest = forgetAllButLatest;
    if (changed) {
        setupCheckpointTask();
    }
}
/** Checkpoint to recover from, if one has been supplied. */
protected Checkpoint recoveryCheckpoint;

/**
 * Records the checkpoint to recover from and rebases its directory onto
 * this service's checkpoints directory.
 *
 * @param checkpoint the recovery checkpoint
 */
@Autowired(required=false)
public void setRecoveryCheckpoint(Checkpoint checkpoint) {
    this.recoveryCheckpoint = checkpoint;
    ConfigPath recoveryDir = checkpoint.getCheckpointDir();
    recoveryDir.setBase(getCheckpointsDir());
}

/**
 * @return the recovery checkpoint, or null if none has been set
 */
public Checkpoint getRecoveryCheckpoint() {
    return recoveryCheckpoint;
}
/** Controller of the crawl whose state is checkpointed. */
protected CrawlController controller;

/**
 * @return the crawl controller this service works with
 */
public CrawlController getCrawlController() {
    return controller;
}

/**
 * Injects the crawl controller.
 *
 * @param controller the crawl controller to use
 */
@Autowired
public void setCrawlController(CrawlController controller) {
    this.controller = controller;
}
// ApplicationContextAware implementation, for eventing

/** Application context, used for bean lookup and event publishing. */
protected AbstractApplicationContext appCtx;

/**
 * Stores the application context. The context is cast to
 * AbstractApplicationContext, so any other implementation is rejected
 * with a ClassCastException.
 *
 * @param applicationContext the context this bean runs in
 */
@Override
public void setApplicationContext(ApplicationContext applicationContext)
        throws BeansException {
    this.appCtx = (AbstractApplicationContext) applicationContext;
}
/**
 * Creates a new CheckpointService. No work is done here; the service
 * becomes active once start() is called.
 */
public CheckpointService() {
    // intentionally empty
}
/**
 * Starts the service; does nothing if it is already running.
 *
 * If a recovery checkpoint is set, its directory is checked for a
 * validity stamp (a missing stamp is logged as severe, not treated as
 * fatal), it becomes the last checkpoint, and the next checkpoint number
 * is derived from its short name. Finally the service is marked running
 * and the auto-checkpoint task is set up.
 */
@Override
public synchronized void start() {
    if (isRunning) {
        return;
    }
    final Checkpoint recovery = getRecoveryCheckpoint();
    if (recovery != null) {
        // report if checkpoint incomplete/invalid
        final File recoveryDir = recovery.getCheckpointDir().getFile();
        final boolean stamped = Checkpoint.hasValidStamp(recoveryDir);
        if (!stamped) {
            LOGGER.severe("checkpoint '" + recoveryDir.getAbsolutePath()
                    + "' missing validity stamp file; checkpoint data may be missing or otherwise corrupt.");
        }
        this.lastCheckpoint = recovery;
        // the serial is the short name with its first two characters dropped
        final String serial = recovery.getShortName().substring(2);
        try {
            final Number parsed = Checkpoint.INDEX_FORMAT.parse(serial);
            this.nextCheckpointNumber = parsed.intValue() + 1;
        } catch (ParseException e) {
            LOGGER.warning("failed to parse serial from "
                    + lastCheckpoint.getShortName() + " - " + e);
        }
    }
    this.isRunning = true;
    setupCheckpointTask();
}
/**
 * Setup checkpointTask according to current interval. (An already-scheduled
 * task, if any, is canceled.)
 *
 * Nothing is scheduled while the service is not running, or when the
 * configured interval is not positive.
 */
protected synchronized void setupCheckpointTask() {
    if (checkpointTask != null) {
        checkpointTask.cancel();
        // do not keep a reference to a task that will never run again
        checkpointTask = null;
    }
    if (!isRunning) {
        // don't setup before start (or after finish), even if
        // triggered by interval change
        return;
    }
    // Convert period from minutes to milliseconds.
    long periodMs = getCheckpointIntervalMinutes() * (60L * 1000L);
    if (periodMs <= 0) {
        return;
    }
    checkpointTask = new TimerTask() {
        @Override
        public void run() {
            // An exception escaping a TimerTask terminates the Timer's thread,
            // which would silently end all further automatic checkpoints;
            // log the failure instead and let the next scheduled run try again.
            try {
                if (isCheckpointing()) {
                    LOGGER.info("CheckpointTimerThread skipping checkpoint, " +
                            "already checkpointing: State: " +
                            controller.getState());
                    return;
                }
                LOGGER.info("TimerThread request checkpoint");
                requestCrawlCheckpoint();
            } catch (RuntimeException e) {
                LOGGER.log(Level.SEVERE,
                        "automatic checkpoint request failed", e);
            }
        }
    };
    this.timer.schedule(checkpointTask, periodMs, periodMs);
    LOGGER.info("Installed Checkpoint TimerTask to checkpoint every " +
            periodMs + " milliseconds.");
}
/** True between start() and stop(). */
protected boolean isRunning = false;

/**
 * @return true if the service has been started and not yet stopped
 */
@Override
public synchronized boolean isRunning() {
    return this.isRunning;
}
/**
 * Stops the service: cancels the auto-checkpoint timer and marks the
 * service as not running.
 *
 * NOTE(review): the Timer is cancelled, not replaced, and a cancelled
 * java.util.Timer rejects further scheduling -- confirm that start() is
 * never expected to be called again on the same instance.
 */
@Override
public synchronized void stop() {
    LOGGER.info("Cleaned up Checkpoint TimerThread.");
    this.isRunning = false;
    this.timer.cancel();
}
/**
 * @return the number that the next checkpoint in the series will use
 */
public int getNextCheckpointNumber() {
    return nextCheckpointNumber;
}
/**
 * Run a checkpoint of the crawler.
 *
 * The request is ignored (null is returned) when the crawl has not
 * started, or when the crawler is paused or stopping and has made no
 * progress since the last successful checkpoint. Otherwise every
 * Checkpointable bean in the application context goes through
 * startCheckpoint(), doCheckpoint() and finishCheckpoint() in turn.
 * A failure during the start/do phases is reported through
 * checkpointFailed(Exception) and does not propagate; the finish phase
 * still runs.
 *
 * The in-progress marker, the checkpoint number and lastCheckpoint are
 * updated even if the validity/finish phase throws, so that a failure
 * there cannot leave the service permanently reporting "already running".
 *
 * @return the name of the checkpoint if it succeeded; null if the request
 *         was ignored or the checkpoint failed
 * @throws IllegalStateException if a checkpoint is already in progress
 */
public synchronized String requestCrawlCheckpoint() throws IllegalStateException {
    if (!controller.hasStarted()) {
        LOGGER.info("crawl job has not started; ignoring");
        return null;
    }
    if (isCheckpointing()) {
        throw new IllegalStateException("Checkpoint already running.");
    }
    // prevent redundant auto-checkpoints when crawler paused or stopping
    if (controller.isPaused() || controller.getState().equals(CrawlController.State.STOPPING)) {
        if (controller.getStatisticsTracker().getSnapshot().sameProgressAs(lastCheckpointSnapshot)) {
            LOGGER.info("no progress since last checkpoint; ignoring");
            System.err.println("no progress since last checkpoint; ignoring");
            return null;
        }
    }
    long checkpointStart = System.currentTimeMillis();
    Map<String, Checkpointable> toCheckpoint = appCtx.getBeansOfType(Checkpointable.class);
    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.fine("checkpointing beans " + toCheckpoint);
    }
    final Checkpoint checkpoint = new Checkpoint();
    checkpointInProgress = checkpoint;
    try {
        try {
            checkpoint.setForgetAllButLatest(getForgetAllButLatest());
            checkpoint.generateFrom(getCheckpointsDir(),
                    getNextCheckpointNumber());
            // pre (incl. acquire necessary locks)
            long startStart = System.currentTimeMillis();
            for (Checkpointable c : toCheckpoint.values()) {
                c.startCheckpoint(checkpoint);
            }
            LOGGER.info("all startCheckpoint() completed in "
                    + (System.currentTimeMillis() - startStart) + "ms");
            // flush/write
            long doStart = System.currentTimeMillis();
            for (Checkpointable c : toCheckpoint.values()) {
                long doMs = System.currentTimeMillis();
                c.doCheckpoint(checkpoint);
                long doDuration = System.currentTimeMillis() - doMs;
                LOGGER.fine("doCheckpoint() " + c + " in " + doDuration + "ms");
            }
            LOGGER.info("all doCheckpoint() completed in "
                    + (System.currentTimeMillis() - doStart) + "ms");
            if (getForgetAllButLatest() && lastCheckpoint != null) {
                try {
                    long deleteStart = System.currentTimeMillis();
                    FileUtils.deleteDirectory(lastCheckpoint.getCheckpointDir().getFile());
                    // only forgotten once the directory is really gone, so the
                    // catch block below can still name it
                    lastCheckpoint = null;
                    LOGGER.info("deleted old checkpoint in "
                            + (System.currentTimeMillis() - deleteStart) + "ms");
                } catch (IOException e) {
                    LOGGER.log(Level.SEVERE,
                            "problem deleting last checkpoint directory "
                            + lastCheckpoint.getCheckpointDir().getFile(),
                            e);
                }
            }
            checkpoint.setSuccess(true);
            appCtx.publishEvent(new CheckpointSuccessEvent(this,
                    checkpoint));
            // Record the stats associated with this successfully-completed checkpoint:
            lastCheckpointSnapshot = controller.getStatisticsTracker().getSnapshot();
        } catch (Exception e) {
            checkpointFailed(e);
        } finally {
            checkpoint.writeValidity(
                    controller.getStatisticsTracker().getProgressStamp());
            // close (incl. release locks)
            long finishStart = System.currentTimeMillis();
            for (Checkpointable c : toCheckpoint.values()) {
                c.finishCheckpoint(checkpoint);
            }
            LOGGER.info("all finishCheckpoint() completed in "
                    + (System.currentTimeMillis() - finishStart) + "ms");
        }
        LOGGER.info("completed checkpoint " + checkpoint.getName()
                + " in " + (System.currentTimeMillis() - checkpointStart) + "ms");
        return checkpoint.getSuccess() ? checkpoint.getName() : null;
    } finally {
        // Always performed, even when the validity/finish phase above throws:
        // otherwise checkpointInProgress would stay set and every later
        // request would be rejected as "already running".
        this.nextCheckpointNumber++;
        this.lastCheckpoint = checkpoint;
        this.checkpointInProgress = null;
    }
}
/**
 * @return True if a checkpoint is in progress, i.e. an in-progress
 *         checkpoint object is currently recorded.
 */
public boolean isCheckpointing() {
    final Checkpoint current = this.checkpointInProgress;
    return current != null;
}
/**
 * Note that a checkpoint failed; the exception is logged at SEVERE level.
 *
 * @param e Exception checkpoint failed on.
 */
protected void checkpointFailed(Exception e) {
    final String text = " Checkpoint failed";
    LOGGER.log(Level.SEVERE, text, e);
}

/**
 * Note that a checkpoint failed; the message is logged as a warning.
 *
 * @param message description of the failure
 */
protected void checkpointFailed(final String message) {
    LOGGER.log(Level.WARNING, message);
}
/**
 * Reports whether there are checkpoints that could be chosen for recovery.
 *
 * Always false once a recovery checkpoint has been set or the service is
 * running. Otherwise the checkpoints directory is scanned exactly once, so
 * warnings about invalid checkpoint directories are logged only once per
 * call.
 *
 * @return true if at least one valid checkpoint directory is available
 */
public boolean hasAvailableCheckpoints() {
    if (getRecoveryCheckpoint() != null || isRunning()) {
        return false;
    }
    List<?> available = findAvailableCheckpointDirectories();
    return available != null && !available.isEmpty();
}
/**
 * Returns a list of available, valid (contains 'valid' file)
 * checkpoint directories, as File instances, with the more
 * recently-written appearing first.
 *
 * Directories without a validity stamp are logged as a warning and left
 * out. If the checkpoints directory cannot be listed, an empty
 * (immutable) list is returned.
 *
 * @return List of valid checkpoint directory File instances; never null
 */
// Retained from the original; may still be needed if the comparator
// constant below is declared with a raw type -- TODO confirm and drop.
@SuppressWarnings("unchecked")
public List<File> findAvailableCheckpointDirectories() {
    File[] dirs = getCheckpointsDir().getFile().listFiles(
            (FileFilter) FileFilterUtils.directoryFileFilter());
    if (dirs == null) {
        return Collections.<File>emptyList();
    }
    Arrays.sort(dirs, LastModifiedFileComparator.LASTMODIFIED_REVERSE);
    LinkedList<File> dirsList = new LinkedList<File>(Arrays.asList(dirs));
    Iterator<File> iter = dirsList.iterator();
    while (iter.hasNext()) {
        File cpDir = iter.next();
        if (!Checkpoint.hasValidStamp(cpDir)) {
            LOGGER.warning("checkpoint '" + cpDir + "' missing validity stamp file; ignoring");
            iter.remove();
        }
    }
    return dirsList;
}
/**
 * Given the name of a valid checkpoint subdirectory in the checkpoints
 * directory, create a Checkpoint instance, and insert it into all
 * Checkpointable beans.
 *
 * The special name "latest" (case-insensitive) selects the most recently
 * written valid checkpoint; if none exists, a warning is logged and no
 * recovery checkpoint is set.
 *
 * @param selectedCheckpoint name of the checkpoint subdirectory to recover
 *        from, or "latest"
 * @throws RuntimeException if the service is already running
 */
public synchronized void setRecoveryCheckpointByName(String selectedCheckpoint) {
    if (isRunning) {
        throw new RuntimeException("may not set recovery Checkpoint after launch");
    }
    // If the selectedCheckpoint is 'latest', pick up the latest checkpoint
    // and use that:
    if ("latest".equalsIgnoreCase(selectedCheckpoint)) {
        List<File> cps = this.findAvailableCheckpointDirectories();
        if (cps == null || cps.isEmpty()) {
            LOGGER.warning(
                    "Cannot find any checkpoints so cannot choose the latest one! Assuming we should launch a new crawl.");
            return;
        }
        // As per the API above, the most recent checkpoint is the first in
        // the list:
        File latestFile = cps.get(0);
        // For the checkpoint we use the folder name:
        selectedCheckpoint = latestFile.getName();
    }
    // Now setup the checkpoint (local name chosen so the field of the same
    // purpose is not shadowed):
    Checkpoint checkpoint = new Checkpoint();
    checkpoint.getCheckpointDir().setBase(getCheckpointsDir());
    checkpoint.getCheckpointDir().setPath(selectedCheckpoint);
    checkpoint.getCheckpointDir().setConfigurer(appCtx.getBean(ConfigPathConfigurer.class));
    checkpoint.afterPropertiesSet();
    setRecoveryCheckpoint(checkpoint);
    // hand the recovery checkpoint to every Checkpointable bean
    Map<String, Checkpointable> toSetRecovery = appCtx.getBeansOfType(Checkpointable.class);
    for (Checkpointable c : toSetRecovery.values()) {
        c.setRecoveryCheckpoint(checkpoint);
    }
}
/** Validator instance shared by all CheckpointService objects. */
protected static Validator VALIDATOR = new CheckpointValidator();

/**
 * @return the shared validator for this bean type
 */
@Override
public Validator getValidator() {
    return CheckpointService.VALIDATOR;
}
} //EOC