/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.io;

import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;

import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.PropertyUtils;



/**
 * Member of {@link WriterPool}.
 * Implements rotating off files, file naming with some guarantee of
 * uniqueness, and position in file. Subclass to pick up functionality for a
 * particular Writer type.
 * @author stack
 * @version $Date$ $Revision$
 */
public abstract class WriterPoolMember implements ArchiveFileConstants {
    private final Logger logger = Logger.getLogger(this.getClass().getName());
    
    public static final String UTF8 = "UTF-8";
    
    /**
     * Default archival-aggregate filename template. 
     * 
     * Under usual assumptions -- hostnames aren't shared among crawling hosts; 
     * processes have unique PIDs and admin ports; timestamps inside one process
     * don't repeat (see UniqueTimestampService); clocks are generally 
     * accurate -- this template will generate a unique name.
     * 
     * Stands for Internet Archive Heritrix.
     */
    public static final String DEFAULT_TEMPLATE = 
        "${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}";
    
    /**
     * Default for file prefix.
     */
    public static final String DEFAULT_PREFIX = "WEB";

    /**
     * Reference to file we're currently writing.
     */
    protected File f = null;

    /**  Output stream for file. */
    protected OutputStream out = null;
    /** Counting stream for metering */
    protected MiserOutputStream countOut = null; 
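    // countOut meters every byte written; getPosition() reports its running
    // count. out normally points at countOut, except while a record is being
    // written compressed, when it is temporarily wrapped in a GZIP stream
    // (see preWriteRecordTasks()/postWriteRecordTasks()).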
    
    protected WriterPoolSettings settings; 
    private final String extension;

    /**
     * Creation date for the current file.
     * Set by {@link #createFile()}.
     */
    protected String currentTimestamp = "UNSET!!!";
    
    protected String currentBasename; 
	
    /**
     * A running sequence number used in making unique file names.
     */
    final private AtomicInteger serialNo;
    
    /**
     * Directories round-robin index.
     */
    protected static int roundRobinIndex = 0;
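    // Static, so every pool member in the JVM shares the same index and
    // successive files are spread round-robin across the configured output
    // directories (see getNextDirectory()).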

    /**
     * NumberFormat instance for formatting serial number.
     *
     * Pads serial number with zeros.
     */
    protected static NumberFormat serialNoFormatter = new DecimalFormat("00000");
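    // e.g. serialNoFormatter.format(7) yields "00007"; the padded value fills
    // the ${serialno} slot in the filename template.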
    
    
    /**
     * Buffer to reuse writing streams.
     */
    protected final byte [] scratchbuffer = new byte[4 * 1024];
 
    
    /**
     * Constructor.
     * Takes a stream. Use with caution: there is no upper-bound check on size;
     * it will just keep writing.
     * 
     * @param serialNo used to create unique filename sequences
     * @param out Where to write.
     * @param file File the out stream is connected to.
     * @param settings Pool settings (compression, flush frequency, etc.).
     * @throws IOException
     */
    protected WriterPoolMember(AtomicInteger serialNo, 
            final OutputStream out, final File file,
            final WriterPoolSettings settings)
    throws IOException {
        this(serialNo, settings, null);
        this.countOut = (out instanceof MiserOutputStream) 
                    ? (MiserOutputStream)out 
                    : new MiserOutputStream(out, settings.getFrequentFlushes());
        this.out = this.countOut; 
        this.f = file;
    }
          
    /**
     * Constructor.
     *
     * @param serialNo used to create unique filename sequences
     * @param settings Pool settings: output directories, file prefix, naming
     *        template, compression, and maximum file size.
     * @param extension Extension to give file.
     */
    public WriterPoolMember(AtomicInteger serialNo,
            final WriterPoolSettings settings, final String extension) {
        this.settings = settings; 
        this.extension = extension;
        this.serialNo = serialNo;
    }
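
    // Typical per-record write cycle for a subclass (a sketch; the concrete
    // record format is supplied by subclasses such as the ARC/WARC writers):
    //
    //   checkSize();              // roll over to a fresh file if oversize
    //   preWriteRecordTasks();    // opens a file if needed; wraps out in GZIP
    //                             // when compression is enabled
    //   write(recordBytes);       // one or more writes, or copyFrom(...)
    //   postWriteRecordTasks();   // finishes the GZIP member, restores out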

    /**
     * Call this method just before/after any significant write.
     *
     * Call at the end of the writing of a record or just before we start
     * writing a new record.  Will close the current file and open a new file
     * if file size has passed our maxSize.
     *
     * Creates and opens a file if none already open. One use of this method,
     * then, is after construction: call this method to add the metadata, then
     * call {@link #getPosition()} to find the offset of the first record.
     *
     * TODO: perhaps this should be called checkForNewOpen? because it also
     * handles initial open, even when not rolling oversize
     *
     * @exception IOException
     */
    public void checkSize() throws IOException {
        if (this.out == null || isOversize()) {
            createFile();
        }
    }

    /**
     * Check if underlying file has already reached its target size.
     * @return boolean true if file has reached target size and is due to be
     * closed
     */
    public boolean isOversize() {
        return settings.getMaxFileSizeBytes() != -1
            && (this.getPosition() > settings.getMaxFileSizeBytes());
    }

    /**
     * Create a new file.
     * Rotates off the current Writer and creates a new one in its place
     * to take subsequent writes. Usually called from {@link #checkSize()}.
     * @return Name of file created.
     * @throws IOException
     */
    protected String createFile() throws IOException {
        generateNewBasename();
        String name = currentBasename + '.' + this.extension +
            ((settings.getCompress()) ? DOT_COMPRESSED_FILE_EXTENSION : "") +
            OCCUPIED_SUFFIX;
        File dir = getNextDirectory(settings.calcOutputDirs());
        return createFile(new File(dir, name));
    }

    protected String createFile(final File file) throws IOException {
        close();
        this.f = file;
        FileOutputStream fos = new FileOutputStream(this.f);
        this.countOut = new MiserOutputStream(
            new FastBufferedOutputStream(fos), settings.getFrequentFlushes());
        this.out = this.countOut;
        logger.fine("Opened " + this.f.getAbsolutePath());
        return this.f.getName();
    }

    /**
     * @param dirs List of File objects that point at directories.
     * @return Next directory to write an ARC to. If more than one, it tries
     * to round-robin through each in turn.
     * @throws IOException
     */
    protected File getNextDirectory(List dirs) throws IOException {
        if (WriterPoolMember.roundRobinIndex >= dirs.size()) {
            WriterPoolMember.roundRobinIndex = 0;
        }
        File d = null;
        try {
            d = checkWriteable((File)dirs.get(WriterPoolMember.roundRobinIndex));
        } catch (IndexOutOfBoundsException e) {
            // Dirs list might be altered underneath us.
            // If so, we get this exception -- just keep on going.
        }
        if (d == null && dirs.size() > 1) {
            for (Iterator i = dirs.iterator(); d == null && i.hasNext();) {
                d = checkWriteable((File)i.next());
            }
        } else {
            WriterPoolMember.roundRobinIndex++;
        }
        if (d == null) {
            throw new IOException("Directories unusable.");
        }
        return d;
    }

    protected File checkWriteable(File d) {
        if (d == null) {
            return d;
        }
        try {
            FileUtils.ensureWriteableDirectory(d);
        } catch (IOException e) {
            logger.warning("Directory " + d.getPath() + " is not" +
                " writeable or cannot be created: " + e.getMessage());
            d = null;
        }
        return d;
    }

    /**
     * Generate a new basename by interpolating values in the configured
     * template. Values come from local state, other configured values, and
     * global system properties. The recommended default template will
     * generate a unique basename under reasonable assumptions.
     */
    protected void generateNewBasename() {
        Properties localProps = new Properties();
        localProps.setProperty("prefix", settings.getPrefix());
        synchronized (this.getClass()) {
            // Ensure that serialNo and timestamp are minted together
            // (never inverted sort order).
            String paddedSerialNumber = WriterPoolMember.serialNoFormatter
                .format(serialNo.getAndIncrement());
            String timestamp17 = ArchiveUtils.getUnique17DigitDate();
            String timestamp14 = ArchiveUtils.getUnique14DigitDate();
            currentTimestamp = timestamp17;
            localProps.setProperty("serialno", paddedSerialNumber);
            localProps.setProperty("timestamp17", timestamp17);
            localProps.setProperty("timestamp14", timestamp14);
        }
        currentBasename = PropertyUtils.interpolateWithProperties(
            settings.getTemplate(), localProps, System.getProperties());
    }

    /**
     * Get the file name.
     *
     * @return the filename, as if uncompressed
     */
    protected String getBaseFilename() {
        String name = this.f.getName();
        if (settings.getCompress() &&
                name.endsWith(DOT_COMPRESSED_FILE_EXTENSION)) {
            return name.substring(0, name.length() - 3);
        } else if (settings.getCompress() &&
                name.endsWith(DOT_COMPRESSED_FILE_EXTENSION + OCCUPIED_SUFFIX)) {
            return name.substring(0,
                name.length() - (3 + OCCUPIED_SUFFIX.length()));
        } else {
            return name;
        }
    }

    /**
     * Get this file.
     *
     * Used by junit test to test for creation and when {@link WriterPool} wants
     * to invalidate a file.
     *
     * @return The current file.
     */
    public File getFile() {
        return this.f;
    }

    /**
     * Pre-write tasks.
     *
     * Has side effects. Will open a new file if we're at the upper bound.
     * If we're writing compressed files, it will wrap the output stream with a
     * GZIP writer, with the side effect that a GZIP header is written out on
     * the stream.
     *
     * @exception IOException
     */
    protected void preWriteRecordTasks() throws IOException {
        if (this.out == null) {
            createFile();
        }
        if (settings.getCompress()) {
            // Wrap stream in GZIP Writer.
            // The below construction immediately writes the GZIP 'default'
            // header out on the underlying stream.
            this.out = new CompressedStream(this.out);
        }
    }

    /**
     * Post file write tasks.
     * If compressed, finishes up compression and flushes stream so any
     * subsequent checks get good reading.
     *
     * @exception IOException
     */
    protected void postWriteRecordTasks() throws IOException {
        if (settings.getCompress()) {
            CompressedStream o = (CompressedStream)this.out;
            o.finish();
            o.flush();
            o.end();
            this.out = o.getWrappedStream();
        }
    }

    /**
     * Position in raw output (typically, physical file).
     * Used for accounting of bytes written.
     * @return Position in final media (assuming all flushing completes)
     * @throws IOException
     */
    public long getPosition() {
        return (countOut == null) ? 0L : this.countOut.getCount();
    }

    public boolean isCompressed() {
        return settings.getCompress();
    }

    protected void write(final byte [] b) throws IOException {
        this.out.write(b);
    }

    protected void flush() throws IOException {
        this.out.flush();
    }

    protected void write(byte[] b, int off, int len) throws IOException {
        this.out.write(b, off, len);
    }

    protected void write(int b) throws IOException {
        this.out.write(b);
    }

    /**
     * Copy bytes from the provided InputStream to the target file/stream being
     * written.
     *
     * @param is InputStream to copy bytes from
     * @param recordLength expected number of bytes to copy
     * @param enforceLength whether to throw an exception if too many/too few
     * bytes are available from the stream
     * @return number of bytes written (normally equal to {@code recordLength})
     * @throws IOException
     */
    protected long copyFrom(final InputStream is, final long recordLength,
            boolean enforceLength) throws IOException {
        int read = scratchbuffer.length;
        long tot = 0;
        while ((tot < recordLength) && (read = is.read(scratchbuffer)) != -1) {
            int write = read;
            // never write more than enforced length
            write = (int) Math.min(write, recordLength - tot);
            tot += read;
            write(scratchbuffer, 0, write);
        }
        if (enforceLength && tot != recordLength) {
            // throw exception if desired for read vs. declared mismatches
            throw new IOException("Read " + tot + " but expected "
                + recordLength);
        }
        return tot;
    }

    public void close() throws IOException {
        if (this.out == null) {
            return;
        }
        this.out.close();
        this.out = null;
        if (this.f != null && this.f.exists()) {
            String path = this.f.getAbsolutePath();
            if (path.endsWith(OCCUPIED_SUFFIX)) {
                File f = new File(path.substring(0,
                    path.length() - OCCUPIED_SUFFIX.length()));
                if (f.exists() && !f.delete()) {
                    logger.warning("Failed delete of " + f);
                }
                if (!this.f.renameTo(f)) {
                    logger.warning("Failed rename of " + path);
                }
                this.f = f;
            }
            logger.fine("Closed " + this.f.getAbsolutePath() +
                ", size " + this.f.length());
        }
    }

    protected OutputStream getOutputStream() {
        return this.out;
    }

    /**
     * An override so we get access to the underlying output stream and can
     * offer an end() that does not entail closing the underlying stream.
     * @author stack
     */
    private class CompressedStream extends GZIPOutputStream {
        public CompressedStream(OutputStream out) throws IOException {
            super(out);
        }

        /**
         * @return Reference to stream being compressed.
         */
        OutputStream getWrappedStream() {
            return this.out;
        }

        /**
         * Release the deflater's native process resources,
         * which otherwise would not occur until either
         * finalization or DeflaterOutputStream.close()
         * (which would also close the underlying stream).
         */
        public void end() {
            def.end();
        }
    }
}




