org.archive.io.CrawlerJournal Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-commons Show documentation
Show all versions of heritrix-commons Show documentation
The Archive Commons Code Libraries project contains general Java utility
libraries, as used by the Heritrix crawler and other projects.
The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.lang.StringUtils;
import org.archive.checkpointing.Checkpoint;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.TextUtils;
/**
* Utility class for a crawler journal/log that is compressed and
* rotates by serial number at checkpoints.
*
* @author gojomo
*/
public class CrawlerJournal implements Closeable {
    private static final Logger LOGGER = Logger.getLogger(
            CrawlerJournal.class.getName());

    /** Size, in bytes, of the buffer placed between the gzip stream and the file. */
    private static final int OUTPUT_BUFFER_SIZE = 32 * 1024;

    /** prefix for error lines */
    public static final String LOG_ERROR = "E ";

    /** prefix for timestamp lines */
    public static final String LOG_TIMESTAMP = "T ";

    /**
     * Stream on which we record frontier events. {@code null} while the
     * journal is closed; all access is guarded by this object's monitor.
     */
    protected Writer out = null;

    /** Number of lines written through {@link #writeLine(String...)} so far. */
    protected long lines = 0;

    /** Number of lines between timestamps; 0 means no timestamps. */
    protected int timestamp_interval = 0;

    /**
     * File we're writing journal to.
     * Keep a reference in case we want to rotate it off.
     */
    protected File gzipFile = null;

    /**
     * Create a new crawler journal at the given location.
     *
     * @param path Directory to make the journal in.
     * @param filename Name to use for journal file.
     * @throws IOException if the journal file cannot be opened
     */
    public CrawlerJournal(String path, String filename)
            throws IOException {
        this.gzipFile = new File(path, filename);
        this.out = initialize(gzipFile);
    }

    /**
     * Create a new crawler journal at the given location.
     *
     * @param file path at which to make journal
     * @throws IOException if the journal file cannot be opened
     */
    public CrawlerJournal(File file) throws IOException {
        this.gzipFile = file;
        this.out = initialize(gzipFile);
    }

    /**
     * Open a fresh gzip-compressed writer on the given file. Any file
     * already at that path is first handed to
     * {@code FileUtils.moveAsideIfExists} (presumably renamed out of the
     * way rather than overwritten -- confirm against that helper).
     *
     * <p>NOTE(review): the writer is created without an explicit charset,
     * so text is encoded with the platform default. Left unchanged here
     * because whatever reads the journal back must agree on the encoding.
     *
     * @param f file to write to
     * @return writer whose output is gzip-compressed into {@code f}
     * @throws FileNotFoundException if {@code f} cannot be opened for writing
     * @throws IOException on any other failure preparing the stream
     */
    protected Writer initialize(final File f) throws FileNotFoundException, IOException {
        FileUtils.moveAsideIfExists(f);
        final FileOutputStream fileOut = new FileOutputStream(f);
        boolean opened = false;
        try {
            Writer writer = new OutputStreamWriter(new GZIPOutputStream(
                    new FastBufferedOutputStream(fileOut, OUTPUT_BUFFER_SIZE)));
            opened = true;
            return writer;
        } finally {
            if (!opened) {
                // Don't leak the file handle when wrapping the stream fails.
                try {
                    fileOut.close();
                } catch (IOException ignored) {
                    // the failure that got us here is the one worth reporting
                }
            }
        }
    }

    /**
     * Write a line made of the given strings concatenated in order, followed
     * by a newline. I/O problems are logged rather than thrown. If the
     * journal is currently closed the line is dropped and the fact logged.
     *
     * @param strs pieces of the line, written back to back with no separator
     */
    public synchronized void writeLine(String... strs) {
        if (this.out == null) {
            // Closed, or a rotation failed to reopen the file: report it the
            // same way as a write failure instead of dying with an NPE.
            LOGGER.log(
                    Level.SEVERE,
                    "journal is closed; dropping line: " + StringUtils.join(strs));
            return;
        }
        try {
            for (String s : strs) {
                this.out.write(s);
            }
            this.out.write("\n");
            noteLine();
        } catch (IOException e) {
            LOGGER.log(
                    Level.SEVERE,
                    "problem writing journal line: " + StringUtils.join(strs),
                    e);
        }
    }

    /**
     * Count a line and give the timestamp logic a chance to run.
     *
     * @throws IOException if writing a timestamp line fails
     */
    protected void noteLine() throws IOException {
        lines++;
        considerTimestamp();
    }

    /**
     * Write a timestamp line if timestamps are enabled and the line count
     * has reached a multiple of {@link #timestamp_interval}.
     *
     * @throws IOException if the timestamp line cannot be written
     */
    protected void considerTimestamp() throws IOException {
        if (timestamp_interval > 0 && lines % timestamp_interval == 0) {
            out.write(LOG_TIMESTAMP);
            out.write(ArchiveUtils.getLog14Date());
            out.write("\n");
        }
    }

    /**
     * Flush and close the underlying IO objects. Safe to call repeatedly;
     * a journal that is already closed is left alone. Problems are logged
     * rather than thrown, and the journal counts as closed afterwards even
     * if flushing or closing failed.
     */
    @Override
    public synchronized void close() {
        if (this.out == null) {
            return;
        }
        try {
            try {
                this.out.flush();
            } finally {
                // Always attempt the close, even when the flush failed, so
                // the file handle is released.
                this.out.close();
            }
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "problem closing journal", e);
        } finally {
            this.out = null;
        }
    }

    /**
     * Note a serious error via a special log line: the {@link #LOG_ERROR}
     * prefix, the current log date, a space and the message. The message is
     * passed on with its own trailing newline, so together with the one
     * {@link #writeLine(String...)} adds, the entry is followed by an empty
     * line.
     *
     * @param err description of the error
     */
    public synchronized void seriousError(String err) {
        writeLine(LOG_ERROR + ArchiveUtils.getLog14Date() + " " + err + "\n");
    }

    /**
     * Handle a checkpoint by rotating the current log to a checkpoint-named
     * file and starting a new log. Does nothing if the journal is closed or
     * its file does not exist. Failures are logged, not thrown; a new log is
     * started even when moving the old one failed, so later writes still
     * have somewhere to go.
     *
     * @param checkpointInProgress checkpoint whose name becomes the suffix
     *        of the rotated file and which decides whether earlier
     *        checkpoint files are merged into it
     */
    public synchronized void rotateForCheckpoint(Checkpoint checkpointInProgress) {
        if (this.out == null || !this.gzipFile.exists()) {
            return;
        }
        close();
        File newName = new File(this.gzipFile.getParentFile(),
                this.gzipFile.getName() + "." + checkpointInProgress.getName());
        try {
            FileUtils.moveAsideIfExists(newName);
            if (checkpointInProgress.getForgetAllButLatest()) {
                mergeIntoCheckpointFile(newName);
            } else {
                renameOrLog(this.gzipFile, newName);
            }
        } catch (IOException ioe) {
            LOGGER.log(Level.SEVERE, "Problem rotating recovery journal", ioe);
        }
        // Open new gzip file. Done outside the try above so a failed
        // merge/rename does not leave the journal permanently closed;
        // initialize() deals with any file still sitting at the journal path.
        try {
            this.out = initialize(this.gzipFile);
        } catch (IOException ioe) {
            LOGGER.log(Level.SEVERE,
                    "Problem reopening recovery journal after rotation", ioe);
        }
    }

    /**
     * Merge any earlier checkpointed files plus the current journal into the
     * new checkpoint file, taking advantage of the legality of concatenating
     * gzips. With no earlier files, the current journal is simply renamed.
     *
     * @param newName destination file for this checkpoint
     * @throws IOException if appending one file to another fails
     */
    private void mergeIntoCheckpointFile(File newName) throws IOException {
        // Built once rather than per directory entry.
        final String regex =
                "^" + Pattern.quote(gzipFile.getName()) + "\\.cp\\d{5}-\\d{14}$";
        // getAbsoluteFile() so that a journal given as a bare relative name
        // (whose getParentFile() is null) still has a directory to list.
        File dir = this.gzipFile.getAbsoluteFile().getParentFile();
        File[] oldCheckpointeds = dir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File parent, String name) {
                return TextUtils.matches(regex, name);
            }
        });
        if (oldCheckpointeds == null) {
            // listFiles() yields null when the directory cannot be read.
            LOGGER.log(Level.WARNING, "unable to list " + dir
                    + "; earlier checkpoint journals not merged");
            oldCheckpointeds = new File[0];
        }
        if (oldCheckpointeds.length == 0) {
            renameOrLog(this.gzipFile, newName);
            return;
        }
        Arrays.sort(oldCheckpointeds);
        File merged = oldCheckpointeds[0];
        for (int i = 1; i < oldCheckpointeds.length; i++) {
            FileUtils.appendTo(merged, oldCheckpointeds[i]);
            deleteOrLog(oldCheckpointeds[i]);
        }
        FileUtils.appendTo(merged, this.gzipFile);
        deleteOrLog(this.gzipFile);
        renameOrLog(merged, newName);
    }

    /** Rename a file, logging instead of silently ignoring a failure. */
    private static void renameOrLog(File from, File to) {
        if (!from.renameTo(to)) {
            LOGGER.log(Level.SEVERE, "unable to rename " + from + " to " + to);
        }
    }

    /** Delete a file, logging instead of silently ignoring a failure. */
    private static void deleteOrLog(File f) {
        if (!f.delete()) {
            LOGGER.log(Level.WARNING, "unable to delete " + f);
        }
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy