// org.archive.io.GenericReplayCharSequence (Maven / Gradle / Ivy)
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.IOUtils;
import org.archive.util.DevUtils;
import com.google.common.base.Charsets;
import com.google.common.primitives.Ints;
/**
 * (Replay)CharSequence view on recorded streams.
 *
 * <p>The leading characters of the stream are kept in an in-memory prefix
 * buffer. Anything beyond the prefix is decoded into a temporary
 * {@link #WRITE_ENCODING} file, which is then read back through a sliding,
 * read-only memory map.
 *
 * <p>For small streams, use {@link InMemoryReplayCharSequence}.
 *
 * <p>Call {@link #close()} on this class when done to clean up resources.
 *
 * @contributor stack
 * @contributor nlevitt
 * @version $Revision$, $Date$
 */
public class GenericReplayCharSequence implements ReplayCharSequence {

    protected static Logger logger = Logger
            .getLogger(GenericReplayCharSequence.class.getName());

    /**
     * Name of the encoding we use writing out concatenated decoded prefix
     * buffer and decoded backing file.
     *
     * <p>This define is also used as suffix for the file that holds the
     * decodings. The name of the file that holds the decoding is the name
     * of the backing file w/ this encoding for a suffix.
     *
     * <p>See {@link Charset} for encoding names.
     */
    public static final Charset WRITE_ENCODING = Charsets.UTF_16BE;

    /** Upper bound, in bytes, on the size of a single memory mapping. */
    private static final long MAP_MAX_BYTES = 64 * 1024 * 1024; // 64M

    /**
     * When the memory map moves away from the beginning of the file
     * (to the "right") in order to reach a certain index, it will
     * map up to this many bytes preceding (to the left of) the target
     * character. Consequently it will map up to
     * <code>MAP_MAX_BYTES - MAP_TARGET_LEFT_PADDING_BYTES</code>
     * bytes to the right of the target.
     *
     * <p>NOTE: {@link #charAt(int)} subtracts this value from a byte offset
     * that is a multiple of <code>bytesPerChar</code>, so it has to be a
     * multiple of <code>bytesPerChar</code> itself to keep the mapping
     * aligned on character boundaries. The current value (671088) is even.
     */
    private static final long MAP_TARGET_LEFT_PADDING_BYTES = (long) (MAP_MAX_BYTES * 0.01);

    /**
     * Total length of character stream to replay minus the HTTP headers
     * if present.
     *
     * <p>If the backing file is larger than <code>Integer.MAX_VALUE</code>
     * (i.e. 2gb), only the first <code>Integer.MAX_VALUE</code> characters
     * are available through this API. We're overriding
     * <code>java.lang.CharSequence</code> so that we can use
     * <code>java.util.regex</code> directly on the data, and the
     * <code>CharSequence</code> API uses <code>int</code> for the length
     * and index.
     */
    protected int length;

    /**
     * Counter of decoding exceptions for report at end. Not modified by the
     * code in this class itself.
     */
    protected long decodingExceptions = 0;

    /** Not assigned by the code in this class itself; stays null here. */
    protected CharacterCodingException codingException = null;

    /**
     * Byte offset into the file where the memory mapped portion begins.
     */
    private long mapByteOffset;

    // XXX do we need to keep the input stream around?
    private FileInputStream backingFileIn = null;

    private FileChannel backingFileChannel = null;

    /** Bytes per character in the decoded file; set to 2 by the constructor. */
    private long bytesPerChar;

    /** Character view of the currently mapped window of the decoded file. */
    private CharBuffer mappedBuffer = null;

    /**
     * File that has decoded content.
     *
     * Keep it around so we can remove on close.
     */
    private File decodedFile = null;

    /*
     * This portion of the CharSequence precedes what's in the backing file.
     * It is filled by relative reads and never flipped, so position() is the
     * number of characters it holds and limit() stays at its capacity.
     */
    private CharBuffer prefixBuffer = null;

    private boolean isOpen = true;

    protected Charset charset = null;

    /**
     * Constructor.
     *
     * <p>Decodes the whole input stream. If it holds more characters than
     * fit in the prefix buffer, opens the decoded overflow file and maps its
     * beginning.
     *
     * @param contentReplayInputStream inputStream of content
     * @param prefixMax maximum number of characters to hold in the in-memory
     * prefix buffer
     * @param backingFilename Path to backing file; used as the base name of
     * the decoded overflow file, which holds content in excess of what fits
     * in the prefix buffer
     * @param charset Encoding to use reading the passed input stream. If
     * null, <code>ReplayCharSequence.FALLBACK_CHARSET</code> is used.
     *
     * @throws IOException if reading the stream or writing/opening the
     * decoded file fails
     */
    public GenericReplayCharSequence(InputStream contentReplayInputStream,
            int prefixMax,
            String backingFilename,
            Charset charset) throws IOException {
        super();
        logger.fine("characterEncoding=" + charset + " backingFilename="
                + backingFilename);
        if (charset == null) {
            charset = ReplayCharSequence.FALLBACK_CHARSET;
        }
        // the whole stream is decoded, but only the first Integer.MAX_VALUE
        // characters are addressable afterwards
        decode(contentReplayInputStream, prefixMax, backingFilename, charset);
        this.bytesPerChar = 2;
        if (length > prefixBuffer.position()) {
            // some characters overflowed to the decoded file
            this.backingFileIn = new FileInputStream(decodedFile);
            this.backingFileChannel = backingFileIn.getChannel();
            this.mapByteOffset = 0;
            updateMemoryMappedBuffer();
        }
    }

    /**
     * (Re)maps the decoded file starting at <code>mapByteOffset</code>,
     * covering at most <code>MAP_MAX_BYTES</code> bytes and never reaching
     * past the last addressable character.
     *
     * @throws RuntimeException wrapping the IOException if mapping fails
     */
    private void updateMemoryMappedBuffer() {
        long charLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters
        long mapSize = Math.min((charLength * bytesPerChar) - mapByteOffset, MAP_MAX_BYTES);
        logger.fine("updateMemoryMappedBuffer: mapOffset="
                + NumberFormat.getInstance().format(mapByteOffset)
                + " mapSize=" + NumberFormat.getInstance().format(mapSize));
        try {
            // TODO: stress-test without these possibly-costly requests!
            // System.gc();
            // System.runFinalization();
            // TODO: Confirm the READ_ONLY works. I recall it not working.
            // The buffers seem to always say that the buffer is writable.
            mappedBuffer = backingFileChannel.map(
                    FileChannel.MapMode.READ_ONLY, mapByteOffset, mapSize)
                    .asReadOnlyBuffer().asCharBuffer();
        } catch (IOException e) {
            // TODO convert this to a runtime error?
            DevUtils.logger.log(Level.SEVERE,
                    " backingFileChannel.map() mapByteOffset=" + mapByteOffset
                    + " mapSize=" + mapSize + "\n" + "decodedFile="
                    + decodedFile + " length=" + length + "\n"
                    + DevUtils.extraInfo(), e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Decodes <code>inStream</code> using <code>charset</code>.
     *
     * <p>Up to <code>prefixMax</code> characters go into the in-memory
     * prefix buffer. Any further characters are re-encoded as
     * <code>WRITE_ENCODING</code> and saved as <code>this.decodedFile</code>,
     * which is named <code>backingFilename + "." + WRITE_ENCODING</code>
     * (with an extra ".win" suffix if the first attempt to open it fails).
     * Sets <code>this.length</code> to the number of characters decoded,
     * saturated at <code>Integer.MAX_VALUE</code>.
     *
     * <p>The reader wrapping <code>inStream</code> is closed only when the
     * overflow file is written; otherwise the stream is left open.
     *
     * @param inStream stream of encoded content
     * @param prefixMax capacity, in characters, of the prefix buffer
     * @param backingFilename base name for the decoded overflow file
     * @param charset encoding of <code>inStream</code>
     * @throws IOException if reading or writing fails
     */
    protected void decode(InputStream inStream, int prefixMax,
            String backingFilename, Charset charset) throws IOException {
        this.charset = charset;
        // TODO: consider if BufferedReader is helping any
        // TODO: consider adding TBW 'LimitReader' to stop reading at
        // Integer.MAX_VALUE characters because of charAt(int) limit
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                inStream, charset));
        logger.fine("backingFilename=" + backingFilename + " encoding="
                + charset + " decodedFile=" + decodedFile);

        this.prefixBuffer = CharBuffer.allocate(prefixMax);
        long count = 0;
        while (count < prefixMax) {
            int read = reader.read(prefixBuffer);
            if (read < 0) {
                break;
            }
            count += read;
        }

        // probe for one more character to learn whether the stream overflows
        // the prefix buffer
        int ch = reader.read();
        if (ch >= 0) {
            // more to decode to file overflow
            count += decodeOverflowToFile(reader, ch, backingFilename);
        }

        this.length = Ints.saturatedCast(count);
        if (count > Integer.MAX_VALUE) {
            logger.warning("input stream is longer than Integer.MAX_VALUE="
                    + NumberFormat.getInstance().format(Integer.MAX_VALUE)
                    + " characters -- only first "
                    + NumberFormat.getInstance().format(Integer.MAX_VALUE)
                    + " are accessible through this GenericReplayCharSequence");
        }
        // position() is the number of characters held in the prefix;
        // CharBuffer.length() would be remaining(), i.e. 0 for a full prefix
        logger.fine("decode: decoded " + count + " characters" +
                ((decodedFile == null) ? ""
                        : " (" + (count - prefixBuffer.position()) + " to " + decodedFile + ")"));
    }

    /**
     * Writes <code>firstChar</code> followed by the rest of
     * <code>reader</code> to the decoded overflow file and closes both the
     * file and the reader. On failure everything is closed, the partial file
     * is deleted and <code>decodedFile</code> is reset to null.
     *
     * @param reader source of the remaining decoded characters
     * @param firstChar character already read from <code>reader</code>
     * @param backingFilename base name for the decoded overflow file
     * @return number of characters written, including <code>firstChar</code>
     * @throws IOException if the file cannot be opened or copying fails
     */
    private long decodeOverflowToFile(BufferedReader reader, int firstChar,
            String backingFilename) throws IOException {
        this.decodedFile = new File(backingFilename + "." + WRITE_ENCODING);
        FileOutputStream fos;
        try {
            fos = new FileOutputStream(this.decodedFile);
        } catch (FileNotFoundException e) {
            // Windows workaround attempt
            System.gc();
            System.runFinalization();
            this.decodedFile = new File(decodedFile.getAbsolutePath() + ".win");
            logger.info("Windows 'file with a user-mapped section open' "
                    + "workaround gc/finalization/name-extension performed.");
            // try again
            fos = new FileOutputStream(this.decodedFile);
        }

        Writer writer = new OutputStreamWriter(fos, WRITE_ENCODING);
        boolean completed = false;
        try {
            writer.write(firstChar);
            long copied = IOUtils.copyLarge(reader, writer);
            // close explicitly on the success path so that a failure to
            // flush the tail of the file is reported, not swallowed
            writer.close();
            reader.close();
            completed = true;
            return copied + 1;
        } finally {
            if (!completed) {
                // don't leak descriptors or leave a truncated file behind
                IOUtils.closeQuietly(writer);
                IOUtils.closeQuietly(fos);
                IOUtils.closeQuietly(reader);
                deleteFile(this.decodedFile);
                this.decodedFile = null;
            }
        }
    }

    /**
     * Get character at passed absolute position.
     *
     * @param index Index into content
     * @return Character at offset <code>index</code>.
     * @throws IndexOutOfBoundsException if <code>index</code> is negative or
     * not less than {@link #length()}
     */
    public char charAt(int index) {
        if (index < 0 || index >= this.length()) {
            throw new IndexOutOfBoundsException("index=" + index
                    + " - should be between 0 and length()=" + this.length());
        }

        // is it in the buffer
        if (index < prefixBuffer.limit()) {
            return prefixBuffer.get(index);
        }

        // otherwise we gotta get it from disk via memory map
        long charFileIndex = (long) index - (long) prefixBuffer.limit();
        long charFileLength = (long) this.length() - (long) prefixBuffer.limit(); // in characters
        if (charFileIndex * bytesPerChar < mapByteOffset) {
            logger.log(Level.WARNING,"left-fault; probably don't want to use CharSequence that far backward");
        }
        if (charFileIndex * bytesPerChar < mapByteOffset
                || charFileIndex - (mapByteOffset / bytesPerChar) >= mappedBuffer.limit()) {
            // fault
            /*
             * mapByteOffset is bounded by 0 and file size +/- size of the map,
             * and starts as close to fileIndex - MAP_TARGET_LEFT_PADDING_BYTES
             * as it can while also not being smaller than it needs to be.
             */
            mapByteOffset = Math.min(charFileIndex * bytesPerChar - MAP_TARGET_LEFT_PADDING_BYTES,
                    charFileLength * bytesPerChar - MAP_MAX_BYTES);
            mapByteOffset = Math.max(0, mapByteOffset);
            updateMemoryMappedBuffer();
        }

        return mappedBuffer.get((int) (charFileIndex - (mapByteOffset / bytesPerChar)));
    }

    /**
     * Returns a {@link CharSubSequence} over this sequence for the range
     * <code>start</code> to <code>end</code>.
     */
    public CharSequence subSequence(int start, int end) {
        return new CharSubSequence(this, start, end);
    }

    /** Deletes the passed file if it exists; no reason is logged. */
    private void deleteFile(File fileToDelete) {
        deleteFile(fileToDelete, null);
    }

    /**
     * Deletes the passed file if it is non-null and exists.
     *
     * @param fileToDelete file to remove; may be null
     * @param e exception that prompted the delete, logged when non-null
     */
    private void deleteFile(File fileToDelete, final Exception e) {
        if (e != null) {
            // Log why the delete to help with debug of
            // java.io.FileNotFoundException:
            // ....tt53http.ris.UTF-16BE.
            logger.severe("Deleting " + fileToDelete + " because of "
                    + e.toString());
        }
        if (fileToDelete != null && fileToDelete.exists()) {
            logger.fine("deleting file: " + fileToDelete);
            if (!fileToDelete.delete()) {
                // don't fail the caller over it, but don't hide a leftover
                // temp file either
                logger.warning("unable to delete file: " + fileToDelete);
            }
        }
    }

    @Override
    public boolean isOpen() {
        return this.isOpen;
    }

    /**
     * Marks this sequence closed, closes the backing channel and stream and
     * deletes the decoded file. Safe to call more than once. The stream is
     * closed and the file deleted even if closing the channel fails.
     *
     * @throws IOException if closing the channel or the stream fails
     */
    public void close() throws IOException {
        this.isOpen = false;
        logger.fine("closing");
        try {
            try {
                if (this.backingFileChannel != null && this.backingFileChannel.isOpen()) {
                    this.backingFileChannel.close();
                }
            } finally {
                if (backingFileIn != null) {
                    backingFileIn.close();
                }
            }
        } finally {
            deleteFile(this.decodedFile);
            // clear decodedFile -- so that double-close (as in finalize()) won't
            // delete a later instance with same name see bug [ 1218961 ]
            // "failed get of replay" in ExtractorHTML... usu: UTF-16BE
            this.decodedFile = null;
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see java.lang.Object#finalize()
     */
    @Override
    protected void finalize() throws Throwable {
        try {
            logger.fine("finalizing");
            close();
        } finally {
            // always give the superclass its turn, even if close() fails
            super.finalize();
        }
    }

    /**
     * Convenience method for getting a substring.
     *
     * @param offset index of the first character
     * @param len number of characters
     * @return the characters from <code>offset</code> to
     * <code>offset + len</code> as a String
     * @deprecated please use subSequence() and then toString() directly
     */
    @Deprecated
    public String substring(int offset, int len) {
        return subSequence(offset, offset + len).toString();
    }

    /**
     * Copies the whole sequence into a String. The builder is presized to
     * {@link #length()}.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder(this.length());
        sb.append(this);
        return sb.toString();
    }

    /** @return number of characters accessible through this sequence */
    public int length() {
        return length;
    }

    /* (non-Javadoc)
     * @see org.archive.io.ReplayCharSequence#getDecodeExceptionCount()
     */
    @Override
    public long getDecodeExceptionCount() {
        return decodingExceptions;
    }

    /* (non-Javadoc)
     * @see org.archive.io.ReplayCharSequence#getCodingException()
     */
    @Override
    public CharacterCodingException getCodingException() {
        return codingException;
    }

    /* (non-Javadoc)
     * @see org.archive.io.ReplayCharSequence#getCharset()
     */
    public Charset getCharset() {
        return charset;
    }
}