// org.archive.io.WriterPoolMember (Maven / Gradle / Ivy)
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.PropertyUtils;
/**
* Member of {@link WriterPool}.
* Implements rotating off files, file naming with some guarantee of
* uniqueness, and position in file. Subclass to pick up functionality for a
* particular Writer type.
* @author stack
* @version $Date$ $Revision$
*/
public abstract class WriterPoolMember implements ArchiveFileConstants {
    /** Per-instance logger, named after the concrete (runtime) subclass. */
    private final Logger logger = Logger.getLogger(this.getClass().getName());

    public static final String UTF8 = "UTF-8";

    /**
     * Default archival-aggregate filename template.
     *
     * Under usual assumptions -- hostnames aren't shared among crawling hosts;
     * processes have unique PIDs and admin ports; timestamps inside one process
     * don't repeat (see UniqueTimestampService); clocks are generally
     * accurate -- will generate a unique name.
     */
    public static final String DEFAULT_TEMPLATE =
        "${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}";

    /**
     * Default for file prefix.
     */
    public static final String DEFAULT_PREFIX = "WEB";

    /**
     * Reference to file we're currently writing.
     */
    protected File f = null;

    /**
     * Output stream for file. While a compressed record is being written
     * this is a GZIP wrapper around {@link #countOut}; otherwise it is
     * {@link #countOut} itself (or null when nothing is open).
     */
    protected OutputStream out = null;

    /** Counting stream for metering. */
    protected MiserOutputStream countOut = null;

    protected WriterPoolSettings settings;

    /** File extension (without leading dot) appended to generated names. */
    private final String extension;

    /**
     * Creation date for the current file.
     * Set by {@link #generateNewBasename()}, which {@link #createFile()}
     * calls.
     */
    protected String currentTimestamp = "UNSET!!!";

    /** Basename produced by the most recent {@link #generateNewBasename()}. */
    protected String currentBasename;

    /**
     * A running sequence used making unique file names.
     */
    final private AtomicInteger serialNo;

    /**
     * Directories round-robin index. Shared by all instances and accessed
     * without synchronization.
     */
    protected static int roundRobinIndex = 0;

    /**
     * NumberFormat instance for formatting serial number.
     *
     * Pads serial number with zeros. {@link DecimalFormat} is not
     * thread-safe; this shared instance is only used while holding the
     * {@code WriterPoolMember.class} monitor.
     */
    protected static NumberFormat serialNoFormatter = new DecimalFormat("00000");

    /**
     * Buffer to reuse writing streams.
     */
    protected final byte [] scratchbuffer = new byte[4 * 1024];

    /**
     * Constructor.
     * Takes a stream. Use with caution. There is no upperbound check on size.
     * Will just keep writing.
     *
     * @param serialNo used to create unique filename sequences
     * @param out Where to write. Used as-is if already a
     * {@link MiserOutputStream}, otherwise wrapped in one.
     * @param file File the <code>out</code> is connected to.
     * @param settings Pool settings (compression, flush policy, etc.).
     * @throws IOException
     */
    protected WriterPoolMember(AtomicInteger serialNo,
            final OutputStream out, final File file,
            final WriterPoolSettings settings)
    throws IOException {
        // No extension is known when handed a pre-opened stream.
        this(serialNo, settings, null);
        this.countOut = (out instanceof MiserOutputStream)
            ? (MiserOutputStream)out
            : new MiserOutputStream(out, settings.getFrequentFlushes());
        this.out = this.countOut;
        this.f = file;
    }

    /**
     * Constructor.
     *
     * @param serialNo used to create unique filename sequences
     * @param settings Pool settings consulted for output directories, prefix,
     * template, compression, maximum size and flush policy.
     * @param extension Extension to give file.
     */
    public WriterPoolMember(AtomicInteger serialNo,
            final WriterPoolSettings settings, final String extension) {
        this.settings = settings;
        this.extension = extension;
        this.serialNo = serialNo;
    }

    /**
     * Call this method just before/after any significant write.
     *
     * Call at the end of the writing of a record or just before we start
     * writing a new record. Will close current file and open a new file
     * if file size has passed the configured maximum size.
     *
     * Creates and opens a file if none already open. One use of this method
     * then is after construction, call this method to add the metadata, then
     * call {@link #getPosition()} to find offset of first record.
     *
     * TODO: perhaps this should be called checkForNewOpen? because it also
     * handles initial open, even when not rolling oversize
     *
     * @exception IOException
     */
    public void checkSize() throws IOException {
        if (this.out == null || isOversize()) {
            createFile();
        }
    }

    /**
     * Check if underlying file has already reached its target size.
     * A configured maximum of -1 means "no limit".
     *
     * @return boolean true if file has reached target size and due to be closed
     */
    public boolean isOversize() {
        return settings.getMaxFileSizeBytes() != -1
            && (this.getPosition() > settings.getMaxFileSizeBytes());
    }

    /**
     * Create a new file.
     * Rotates off the current Writer and creates a new in its place
     * to take subsequent writes. Usually called from {@link #checkSize()}.
     * @return Name of file created.
     * @throws IOException
     */
    protected String createFile() throws IOException {
        generateNewBasename();
        String name = currentBasename + '.' + this.extension +
            ((settings.getCompress())? DOT_COMPRESSED_FILE_EXTENSION: "") +
            OCCUPIED_SUFFIX;
        File dir = getNextDirectory(settings.calcOutputDirs());
        return createFile(new File(dir, name));
    }

    /**
     * Close whatever is currently open, then open <code>file</code> for
     * writing through a buffered, counting stream.
     *
     * @param file File to open.
     * @return Name (last path element) of the opened file.
     * @throws IOException
     */
    protected String createFile(final File file) throws IOException {
        close();
        this.f = file;
        FileOutputStream fos = new FileOutputStream(this.f);
        this.countOut = new MiserOutputStream(
            new FastBufferedOutputStream(fos), settings.getFrequentFlushes());
        this.out = this.countOut;
        logger.fine("Opened " + this.f.getAbsolutePath());
        return this.f.getName();
    }

    /**
     * @param dirs List of File objects that point at directories.
     * @return Find next directory to write an arc too. If more
     * than one, it tries to round-robin through each in turn.
     * @throws IOException if no listed directory is usable.
     */
    protected File getNextDirectory(List<?> dirs)
    throws IOException {
        if (WriterPoolMember.roundRobinIndex >= dirs.size()) {
            WriterPoolMember.roundRobinIndex = 0;
        }
        File d = null;
        try {
            d = checkWriteable(
                (File)dirs.get(WriterPoolMember.roundRobinIndex));
        } catch (IndexOutOfBoundsException e) {
            // Dirs list might be altered underneath us.
            // If so, we get this exception -- just keep on going.
        }
        if (d == null && dirs.size() > 1) {
            // Preferred directory unusable: take the first usable one.
            for (Iterator<?> i = dirs.iterator(); d == null && i.hasNext();) {
                d = checkWriteable((File)i.next());
            }
        } else {
            WriterPoolMember.roundRobinIndex++;
        }
        if (d == null) {
            throw new IOException("Directories unusable.");
        }
        return d;
    }

    /**
     * @param d Candidate directory; may be null.
     * @return <code>d</code> if {@link FileUtils#ensureWriteableDirectory}
     * accepted it, otherwise null (a warning is logged on failure).
     */
    protected File checkWriteable(File d) {
        if (d == null) {
            return d;
        }
        try {
            FileUtils.ensureWriteableDirectory(d);
        } catch(IOException e) {
            logger.warning("Directory " + d.getPath() + " is not" +
                " writeable or cannot be created: " + e.getMessage());
            d = null;
        }
        return d;
    }

    /**
     * Generate a new basename by interpolating values in the configured
     * template. Values come from local state, other configured values, and
     * global system properties. The recommended default template will
     * generate a unique basename under reasonable assumptions.
     */
    protected void generateNewBasename() {
        Properties localProps = new Properties();
        localProps.setProperty("prefix", settings.getPrefix());
        // Lock on WriterPoolMember.class rather than this.getClass(): the
        // formatter below is a static shared by every subclass, so a
        // per-subclass monitor would not keep different subclasses from
        // using the non-thread-safe DecimalFormat concurrently.
        synchronized (WriterPoolMember.class) {
            // ensure that serialNo and timestamp are minted together (never inverted sort order)
            String paddedSerialNumber = WriterPoolMember.serialNoFormatter.format(serialNo.getAndIncrement());
            String timestamp17 = ArchiveUtils.getUnique17DigitDate();
            String timestamp14 = ArchiveUtils.getUnique14DigitDate();
            currentTimestamp = timestamp17;
            localProps.setProperty("serialno", paddedSerialNumber);
            localProps.setProperty("timestamp17", timestamp17);
            localProps.setProperty("timestamp14", timestamp14);
        }
        currentBasename = PropertyUtils.interpolateWithProperties(settings.getTemplate(),
            localProps, System.getProperties());
    }

    /**
     * Get the file name
     *
     * @return the filename, as if uncompressed: when compression is on, a
     * trailing compressed-file extension (optionally followed by the
     * occupied suffix) is stripped; otherwise the name is returned as is.
     */
    protected String getBaseFilename() {
        String name = this.f.getName();
        if (!settings.getCompress()) {
            return name;
        }
        // Derive the strip length from the constant instead of assuming 3.
        final int compressedExtLength = DOT_COMPRESSED_FILE_EXTENSION.length();
        if (name.endsWith(DOT_COMPRESSED_FILE_EXTENSION)) {
            return name.substring(0, name.length() - compressedExtLength);
        }
        if (name.endsWith(DOT_COMPRESSED_FILE_EXTENSION + OCCUPIED_SUFFIX)) {
            return name.substring(0, name.length() -
                (compressedExtLength + OCCUPIED_SUFFIX.length()));
        }
        return name;
    }

    /**
     * Get this file.
     *
     * Used by junit test to test for creation and when {@link WriterPool} wants
     * to invalidate a file.
     *
     * @return The current file.
     */
    public File getFile() {
        return this.f;
    }

    /**
     * Pre write tasks.
     *
     * Has side effects. Will open a new file if none is open.
     * If we're writing compressed files, it will wrap output stream with a
     * GZIP writer with side effect that GZIP header is written out on the
     * stream.
     *
     * @exception IOException
     */
    protected void preWriteRecordTasks()
    throws IOException {
        if (this.out == null) {
            createFile();
        }
        if (settings.getCompress()) {
            // Wrap stream in GZIP Writer.
            // The below construction immediately writes the GZIP 'default'
            // header out on the underlying stream.
            this.out = new CompressedStream(this.out);
        }
    }

    /**
     * Post file write tasks.
     * If compressed, finishes up compression and flushes stream so any
     * subsequent checks get good reading. The deflater is released and the
     * unwrapped stream restored even when finishing or flushing fails, so
     * a failed record neither leaks native resources nor leaves the GZIP
     * wrapper in place to be wrapped again.
     *
     * @exception IOException
     */
    protected void postWriteRecordTasks()
    throws IOException {
        if (settings.getCompress()) {
            CompressedStream o = (CompressedStream)this.out;
            try {
                o.finish();
                o.flush();
            } finally {
                // Must happen together: once the deflater is ended the
                // wrapper can no longer be finished or closed safely.
                o.end();
                this.out = o.getWrappedStream();
            }
        }
    }

    /**
     * Position in raw output (typically, physical file).
     * Used making accounting of bytes written.
     * @return Position in final media (assuming all flushing completes);
     * 0 if no counting stream has been set up yet.
     */
    public long getPosition() {
        return (countOut==null)? 0L : this.countOut.getCount();
    }

    public boolean isCompressed() {
        return settings.getCompress();
    }

    protected void write(final byte [] b) throws IOException {
        this.out.write(b);
    }

    protected void flush() throws IOException {
        this.out.flush();
    }

    protected void write(byte[] b, int off, int len) throws IOException {
        this.out.write(b, off, len);
    }

    protected void write(int b) throws IOException {
        this.out.write(b);
    }

    /**
     * Copy bytes from the provided InputStream to the target file/stream being
     * written.
     *
     * @return number of bytes read from <code>is</code> (normally equal to
     * {@code recordLength}); never more than {@code recordLength}
     * bytes are written, but the final read may consume more.
     * @param is
     * InputStream to copy bytes from
     * @param recordLength
     * expected number of bytes to copy
     * @param enforceLength
     * whether to throw an exception if too many/too few bytes are
     * available from stream
     * @throws IOException
     */
    protected long copyFrom(final InputStream is, final long recordLength,
            boolean enforceLength) throws IOException {
        int read;
        long tot = 0;
        while ((tot < recordLength)
                && (read = is.read(scratchbuffer)) != -1) {
            int write = read;
            // never write more than enforced length
            write = (int) Math.min(write, recordLength - tot);
            // tot counts bytes read, so an over-long final read is detected
            // by the length check below.
            tot += read;
            write(scratchbuffer, 0, write);
        }
        if (enforceLength && tot != recordLength) {
            // throw exception if desired for read vs. declared mismatches
            throw new IOException("Read " + tot + " but expected "
                + recordLength);
        }
        return tot;
    }

    /**
     * Close the current output stream, if any, and rename the file to drop
     * the occupied suffix (replacing any existing file of the final name).
     * Rename/delete failures are logged, not thrown.
     *
     * @throws IOException if closing the stream fails.
     */
    public void close() throws IOException {
        if (this.out == null) {
            return;
        }
        this.out.close();
        this.out = null;
        if (this.f != null && this.f.exists()) {
            String path = this.f.getAbsolutePath();
            if (path.endsWith(OCCUPIED_SUFFIX)) {
                File finalFile = new File(path.substring(0,
                    path.length() - OCCUPIED_SUFFIX.length()));
                if (finalFile.exists() && !finalFile.delete()) {
                    logger.warning("Failed delete of " + finalFile);
                }
                if (!this.f.renameTo(finalFile)) {
                    logger.warning("Failed rename of " + path);
                }
                this.f = finalFile;
            }
            logger.fine("Closed " + this.f.getAbsolutePath() +
                ", size " + this.f.length());
        }
    }

    protected OutputStream getOutputStream() {
        return this.out;
    }

    /**
     * An override so we get access to underlying output stream.
     * and offer an end() that does not accompany closing underlying
     * stream.
     * @author stack
     */
    private static class CompressedStream extends GZIPOutputStream {
        public CompressedStream(OutputStream out)
        throws IOException {
            super(out);
        }

        /**
         * @return Reference to stream being compressed.
         */
        OutputStream getWrappedStream() {
            return this.out;
        }

        /**
         * Release the deflater's native process resources,
         * which otherwise would not occur until either
         * finalization or DeflaterOutputStream.close()
         * (which would also close underlying stream).
         */
        public void end() {
            def.end();
        }
    }
}