All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.io.CrawlerJournal Maven / Gradle / Ivy

Go to download

The Archive Commons Code Libraries project contains general Java utility libraries, as used by the Heritrix crawler and other projects.

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.io;

import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;

import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.lang.StringUtils;
import org.archive.checkpointing.Checkpoint;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.TextUtils;

/**
 * Utility class for a crawler journal/log that is compressed and 
 * rotates by serial number at checkpoints. 
 * 
 * <p>Lines are written through a gzip-compressed {@link Writer}. At a
 * checkpoint the current file is renamed (or merged with earlier
 * checkpoint files) and a fresh journal file is opened under the original
 * name.
 * 
 * <p>{@link #writeLine(String...)}, {@link #seriousError(String)},
 * {@link #close()} and {@link #rotateForCheckpoint(Checkpoint)} all
 * synchronize on this instance.
 * 
 * @author gojomo
 */
public class CrawlerJournal implements Closeable {
    private static final Logger LOGGER = Logger.getLogger(
            CrawlerJournal.class.getName());

    /** Size in bytes of the buffer between the gzip stream and the file. */
    private static final int BUFFER_SIZE = 32 * 1024;
    
    /** prefix for error lines*/
    public static final String LOG_ERROR = "E ";
    /** prefix for timestamp lines */
    public static final String LOG_TIMESTAMP = "T ";

    /**
     * Stream on which we record frontier events.
     * {@code null} once the journal has been closed, or if re-opening
     * after a rotation failed.
     */
    protected Writer out = null;
    
    /** line count */ 
    protected long lines = 0;
    /** number of lines between timestamps */ 
    protected int timestamp_interval = 0; // 0 means no timestamps
    
    /**
     * File we're writing journal to.
     * Keep a reference in case we want to rotate it off.
     */
    protected File gzipFile = null;
    
    /**
     * Create a new crawler journal at the given location
     * 
     * @param path Directory to make the journal in.
     * @param filename Name to use for journal file.
     * @throws IOException if the journal file cannot be opened
     */
    public CrawlerJournal(String path, String filename)
    throws IOException {
        this.gzipFile = new File(path, filename);
        this.out = initialize(gzipFile);
    }
    
    /**
     * Create a new crawler journal at the given location
     * 
     * @param file path at which to make journal
     * @throws IOException if the journal file cannot be opened
     */
    public CrawlerJournal(File file) throws IOException {
        this.gzipFile = file;
        this.out = initialize(gzipFile);
    }
    
    /**
     * Open a gzip-compressing writer on the given file, first asking
     * {@link FileUtils#moveAsideIfExists(File)} to deal with any file
     * already at that path.
     * 
     * <p>NOTE: this is invoked from the constructors, so an override runs
     * before the subclass's own field initializers have executed.
     * 
     * <p>NOTE(review): the {@link OutputStreamWriter} is created without an
     * explicit charset and therefore encodes with the platform default;
     * left unchanged because existing readers may rely on it.
     * 
     * @param f file to write the journal to
     * @return writer that gzips everything written to it into {@code f}
     * @throws FileNotFoundException if {@code f} cannot be opened for writing
     * @throws IOException if the gzip stream cannot be set up
     */
    protected Writer initialize(final File f) throws FileNotFoundException, IOException {
        FileUtils.moveAsideIfExists(f);
        final FileOutputStream fileOut = new FileOutputStream(f);
        try {
            return new OutputStreamWriter(new GZIPOutputStream(
                new FastBufferedOutputStream(fileOut, BUFFER_SIZE)));
        } catch (IOException e) {
            // Don't leak the file handle when the wrapper chain can't be built.
            try {
                fileOut.close();
            } catch (IOException ignored) {
                // the original failure is the one worth reporting
            }
            throw e;
        }
    }

    /**
     * Write a line: the given strings concatenated with no separator,
     * followed by a newline. I/O failures are logged, not thrown. If the
     * journal is not open the line is dropped and the fact is logged.
     * 
     * @param strs pieces of the line, written in order
     */
    public synchronized void writeLine(String... strs) {
        if (this.out == null) {
            // Closed, or re-open after rotation failed: report rather than NPE.
            LOGGER.log(
                Level.SEVERE,
                "journal not open, dropping line: "+StringUtils.join(strs));
            return;
        }
        try {
            for(String s : strs) {
                this.out.write(s);
            }
            this.out.write("\n");
            noteLine();
        } catch (IOException e) {
            LOGGER.log(
                Level.SEVERE,
                "problem writing journal line: "+StringUtils.join(strs), 
                e);
        }
    }

    /**
     * Count and note a line
     * 
     * @throws IOException if writing a timestamp line fails
     */
    protected void noteLine() throws IOException {
        lines++;
        considerTimestamp();
    }

    /**
     * Write a timestamp line if appropriate, i.e. when timestamps are
     * enabled and the line count is a multiple of the interval. The
     * timestamp line itself is not added to the line count.
     * 
     * @throws IOException if the write fails
     */
    protected void considerTimestamp() throws IOException {
        if(timestamp_interval > 0 && lines % timestamp_interval == 0) {
            out.write(LOG_TIMESTAMP);
            out.write(ArchiveUtils.getLog14Date());
            out.write("\n");
        }
    }

    /**
     * Flush and close the underlying IO objects. Does nothing if already
     * closed. Failures are logged, not thrown; either way the journal is
     * considered closed afterwards.
     */
    @Override
    public synchronized void close() {
        final Writer closing = this.out;
        if (closing == null) {
            return;
        }
        // Drop the reference first so a failed flush/close can't leave a
        // half-closed writer behind for later writes.
        this.out = null;
        try {
            try {
                closing.flush();
            } finally {
                // Release the file handle even when the flush failed.
                closing.close();
            }
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE,"problem closing journal", e);
        }
    }

    /**
     * Note a serious error via a special log line, prefixed with
     * {@link #LOG_ERROR} and the current timestamp.
     * 
     * <p>The message is passed on with a trailing newline in addition to
     * the one {@link #writeLine(String...)} appends, so each error line is
     * followed by an empty line. Kept as-is since existing journal readers
     * may expect that layout.
     * 
     * @param err description of the error
     */
    public synchronized void seriousError(String err) {
        writeLine(LOG_ERROR+ArchiveUtils.getLog14Date()+" "+err+"\n");
    }

    /**
     * Handle a checkpoint by rotating the current log to a checkpoint-named
     * file and starting a new log. 
     * 
     * <p>Does nothing if the journal is not open or its file does not exist.
     * Otherwise the journal is closed and moved to
     * {@code <journal-name>.<checkpoint-name>}; when the checkpoint asks to
     * forget all but the latest, earlier checkpoint journals are merged into
     * that file as well. A fresh journal is then opened under the original
     * name, even if the rotation itself failed, so later writes are not lost.
     * Failures are logged, not thrown.
     * 
     * @param checkpointInProgress checkpoint supplying the name suffix and
     *        the forget-all-but-latest setting
     */
    public synchronized void rotateForCheckpoint(Checkpoint checkpointInProgress) {
        if (this.out == null || !this.gzipFile.exists()) {
            return;
        }
        close();

        // Resolve against the working directory so a journal given as a bare
        // file name still has a parent directory to list.
        final File dir = this.gzipFile.getAbsoluteFile().getParentFile();
        final File newName = new File(dir,
                this.gzipFile.getName() + "." + checkpointInProgress.getName());
        try {
            FileUtils.moveAsideIfExists(newName); 
            if (checkpointInProgress.getForgetAllButLatest()) {
                mergeIntoCheckpoint(dir, newName);
            } else {
                renameOrLog(this.gzipFile, newName);
            }
        } catch (IOException ioe) {
            LOGGER.log(Level.SEVERE,"Problem rotating recovery journal", ioe);
        }

        // Open new gzip file. Done outside the rotation try-block so that a
        // failed rotation doesn't leave the journal permanently closed.
        try {
            this.out = initialize(this.gzipFile);
        } catch (IOException ioe) {
            LOGGER.log(Level.SEVERE,
                "Problem reopening journal " + this.gzipFile + " after rotation",
                ioe);
        }
    }

    /**
     * Merge any earlier checkpointed files and the current journal into the
     * new checkpoint file, taking advantage of the legality of concatenating
     * gzips. With no earlier checkpoint files the journal is simply renamed.
     * 
     * @param dir directory holding the journal; may be {@code null}
     * @param newName destination checkpoint file
     * @throws IOException if appending one file to another fails
     */
    private void mergeIntoCheckpoint(File dir, File newName) throws IOException {
        // Built once rather than once per directory entry.
        final String regex = "^" + Pattern.quote(gzipFile.getName()) + "\\.cp\\d{5}-\\d{14}$";
        File[] oldCheckpointeds = (dir == null) ? null : dir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File ignoredDir, String name) {
                return TextUtils.matches(regex, name);
            }
        });
        if (oldCheckpointeds == null) {
            // listFiles returns null for a non-directory or on an I/O error.
            LOGGER.log(Level.WARNING, "could not list " + dir
                + "; not merging earlier checkpoint journals");
            oldCheckpointeds = new File[0];
        }
        Arrays.sort(oldCheckpointeds);

        if (oldCheckpointeds.length == 0) {
            renameOrLog(this.gzipFile, newName);
            return;
        }

        // Presumably appendTo(target, source) appends source onto target --
        // TODO confirm against FileUtils; the order is unchanged from before.
        final File merged = oldCheckpointeds[0];
        for (int i = 1; i < oldCheckpointeds.length; i++) {
            FileUtils.appendTo(merged, oldCheckpointeds[i]);
            deleteOrLog(oldCheckpointeds[i]);
        }
        FileUtils.appendTo(merged, this.gzipFile);
        deleteOrLog(this.gzipFile);
        renameOrLog(merged, newName);
    }

    /** Rename {@code from} to {@code to}, logging if the rename fails. */
    private static void renameOrLog(File from, File to) {
        if (!from.renameTo(to)) {
            LOGGER.log(Level.SEVERE, "failed to rename " + from + " to " + to);
        }
    }

    /** Delete {@code f}, logging if the delete fails. */
    private static void deleteOrLog(File f) {
        if (!f.delete()) {
            LOGGER.log(Level.WARNING, "failed to delete " + f);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy