
org.apache.daffodil.io.InputSource.scala Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.daffodil.io
import java.nio.ByteBuffer
import scala.collection.mutable.ArrayBuffer
import org.apache.daffodil.exceptions.Assert
import org.apache.daffodil.exceptions.ThinThrowable
/**
* There is a finite limit to the distance one can backtrack which is given by
* the implementation's use of a finite array of finite fixed size buckets. If a
* vast amount of data is read in and is more than the ultimate backtrack limit
* (number of buckets times bucket size) in size, then that will cause data
* buckets to effectively spill off into parser history, and the ability to
* backtrack to the points in the data they stored is lost along with them. Any
* time there is a point-of-uncertainty to which the parser could backtrack, and
* then the parser advances through more data than the backtracking maximum
* limit, the ability to backtrack to that point of uncertainty will be lost.
* This will be detected, and is a fatal error (Runtime SDE).
*
* This situation is easiest to envision if BLOB objects are involved, but is
* not BLOB specific. A format with a choice, then in the first branch of the
* choice, a group and/or array of small data items, ultimately totaling in size
* to more than the backtrack limit, and then a branch failure, will cause this
* backtracking limit error with no BLOBs being used.
*
* There is no limit created by this module of code to the size of a any data
* item having to fit within JVM byte array maximums, nor the JVM memory
* footprint even. Only a limitation on backtracking distance is created.
*
* It is also worth noting that it is not the act of backtracking that directly
* cause an error. The error only occurs when someone tries to read from a
* bucket that has already been released.
*/
case class BacktrackingException(position: Long, maxBacktrackLength: Int)
extends Exception("Attempted to backtrack to byte %d, which exceeds maximum backtrack length of %d".format(position, maxBacktrackLength))
with ThinThrowable
/**
* The InputSource class is really just a mechanism to provide bytes an
* InputSourceDataInputStream, which does the heavily lift about converter
* bits/bytes to numbers and characters. This class does not need to know
* anything about bits, it is purely byte centric. One core difference from
* this vs an InputStream is that is must have the capability to backtrack to
* arbitrary points in the InputStreams history. To aide in this, methods are
* called to let the InputSource know which byte positions might we might need
* to backtrack to, which can allow it to free data that know longer is needed.
* One can almost thing of things as an InputStream that supports multiple
* marks with random access.
*/
abstract class InputSource {
/**
* Determine whether the underlying data has the specified number of bytes
* available starting at the current byte position. This function must block
* until either nBytes are known to be available or end-of-file is reached.
* This does not advance the current position.
*
* @param nBytes the number of bytes to determine if are available
* @return true if nBytes are available, false otherwise
*/
def areBytesAvailable(nBytes: Long): Boolean
/**
* Return the number of currently available bytes.
*
* This should not be used to determine the length of the data, as more bytes
* may become available in the future. This should really only be used for
* debug purposes.
*/
def bytesAvailable(): Long
/**
* Return a single byte at the current byte position with a value in the
* range of 0 to 255.
*
* Increments the current byte position if successful.
*
* @return the byte at the current byte position if it exists, or -1 if EOF is reached.
*/
def get(): Int
/**
* Return a byte array with data from the current byte position.
*
* Stores the next len bytes of data in dest starting at index off. In len
* bytes are not available or len bytes cannot fit in the dest array starting
* at the given offset, the dest array is not modified and false is returned.
*
* @return true if len bytes are available and written to the dest array,
* false otherwise
*/
def get(dest: Array[Byte], off: Int, len: Int): Boolean
/**
* Get the current byte position, using zero-based indexing
*
* @return the current byte position
*/
def position(): Long
/**
* Set the current byte position, using zero-based indexing
*
* bytPos0b cannot be greater than the most recent read data. In other words,
* this can only be used to move backwards in data.
*
* @param bytePos0b the new current byte position
*/
def position(bytePos0b: Long): Unit
/**
* Set the specified byte position as a location that that one may want to
* call setPosition in the future. This is essentially setting a mark in the
* data that can be reset back to later. Implementations are allowed to free
* any bytes before a locked byte position. Any bytes after a locked position
* cannot be freed until that lock is release.
*
* Note that this "lock" has nothing to do with synchronization, but behaves
* more like marks that must be accessable until released.
*
* @param bytePos0b the byte position to lock
*/
def lockPosition(bytePos0b: Long): Unit
/**
* Release a previously locked byte position, allowing the implementation to
* free any unlocked bytes.
*
* @param bytePos0b the byte position to release
*/
def releasePosition(bytePos0b: Long): Unit
/**
* Alerts the implementation to attempt to free data that is no longer used,
* if possible. If possible, this should free any unlocked bytes.
*/
def compact(): Unit
private var _debugging = false
protected var _isValid = true
final def areDebugging: Boolean = _debugging
final def isValid: Boolean = _isValid
final def setInvalid: Unit = { _isValid = false }
final def setDebugging(setting: Boolean): Unit = _debugging = setting
}
/**
* Implements the InputSource interface, reading data from a generic
* java.io.InputStream and storing the data in buckets of a defined size.
* Buckets are freed when no "locks" exist inside the bucket to minimize memory
* usage. Note that "locks" in this sense are the InputSource locks on
* bytePosition and are not about syncrhonization. This more of a reference
* count, counting buckets to determine which buckets are no longer needed and
* can be freed when the reference count goes to zero.
*
* @param inputStream the java.io.Inputstream to read data from
* @param bucketSize the size of each individual bucket
* @param maxCacheSizeInBytes the max memory allowed to be used for bucket storage (num buckets * bucketSize)
*/
class BucketingInputSource(
inputStream: java.io.InputStream,
bucketSize: Int = 1 << 13,
maxCacheSizeInBytes: Int = 256 * (1 << 20))
extends InputSource {
private class Bucket {
var refCount = 0
val bytes = new Array[Byte](bucketSize)
}
private val maxNumberOfBuckets = Math.max(maxCacheSizeInBytes / bucketSize, 2)
/**
* Array of buckets
*/
private val buckets = {
val b = new ArrayBuffer[Bucket]()
// initialize with an empty bucket
b += new Bucket()
b
}
/**
* Used to keep track of the oldest bucket stored in the bucket array. When a
* bucket is released, this number will be incremented, allowing us to
* quickly know which buckets have something in them. All indices in the
* buckets ArrayBuffer less than this index should have a value of null
*/
private var oldestBucketIndex = 0
/**
* Stores the current byte position. When a call to get is made, the
* byte(s) at this position is retrieved and this value is incremented.
*/
private var curBytePosition0b: Long = 0
/**
* Contains the total number of bytes that have been read and stored in a
* bucket.
*/
private var totalBytesBucketed: Long = 0
/**
* Stores the byte position represented by the 0th index of the 0th bucket.
* This implementation will periodically throw away old unneeded buckets and
* move the remaining buckets to the front of the buckets ArrayBuffer. This
* value is used to determine which bucket to grab data from, essentially
* acting as an offset of curBytePosition0b.
*/
private var headBucketBytePosition0b: Long = 0
/**
* Stores the number of bytes that have been filled in the last bucket. A
* bucket is allocated to bucketSize, but not all may be used if the source
* InputStream did not give us enough or ran out of data. This says how much
* data was in the last bucket in such a case.
*/
private var bytesFilledInLastBucket: Int = 0
/**
* Adds new buckets to the buckets array until either we run out of data or
* we fill up to byteIndex bytes in the bucketIndex. This modifies
* bytesFilledInLastBucket, which may be less than bucketSize if the last
* bucket was not completely filled.
*
* @return false if EOF was hit before filling in the necessary bytes. true otherwise.
*/
private def fillBucketsToIndex(goalBucketIndex: Long, bytesNeededInBucket: Long): Boolean = {
var lastBucketIndex = buckets.length - 1
var hasMoreData = true
var needsMoreData = goalBucketIndex > lastBucketIndex || (goalBucketIndex == lastBucketIndex && bytesNeededInBucket > bytesFilledInLastBucket)
while (needsMoreData && hasMoreData) {
// Try to fill the rest of this bucket, regardless of how many bytes are
// actually needed
val emptyBytesInLastBucket = bucketSize - bytesFilledInLastBucket
// Try to read enough bytes to fill the rest of this bucket. Note that
// the .read() function could hit EOF (returns -1) or could return
// anywhere from zero to emptyBytesInLastBucket bytes
val bytesRead = inputStream.read(buckets(lastBucketIndex).bytes, bytesFilledInLastBucket, emptyBytesInLastBucket)
if (bytesRead == -1) {
// Needed more data but hit EOF, break out with error
hasMoreData = false
} else {
totalBytesBucketed += bytesRead
bytesFilledInLastBucket += bytesRead
if (bytesFilledInLastBucket == bucketSize) {
// We completely filled in this bucket, let's create a new bucket for
// the next time this inner loop is hit (might be immediately if we
// didn't reach the target bucket, or might be the next time this
// function is called if we did). Increment index so that if we do
// continue this loop immediately, we're filling in the new bucket.
buckets += new Bucket()
bytesFilledInLastBucket = 0
lastBucketIndex += 1
if (buckets.size > maxNumberOfBuckets) {
// This frees the oldest bucket, allowing it to be garbage collected.
buckets(oldestBucketIndex.toInt) = null
oldestBucketIndex += 1
}
}
if (( lastBucketIndex == goalBucketIndex &&
bytesNeededInBucket <= bytesFilledInLastBucket) ||
lastBucketIndex > goalBucketIndex) {
// Filled data at least to the target byte in the target bucket
// We are done
needsMoreData = false
} else {
// Either we didn't get to the target bucket, or we did but we didn't
// fill in enough bytes in the target bucket. In either case, we need
// more bytes in this bucket, so loop around again using the new
// bucket index and bytesFilledInLastBucket, and try to fill it up.
// Nothing needs to be done here.
}
}
}
// this function succeeds if we got to a point where we no longer need
// anymore data (i.e. we filled in at least needed bytes in the specified
// goalBucketIndex).
!needsMoreData
}
@inline
final private def bytePositionToIndicies(bytePos0b: Long): (Long, Long) = {
val offsetBytePosition0b = bytePos0b - headBucketBytePosition0b
val bucketIndex = offsetBytePosition0b / bucketSize
val byteIndex = offsetBytePosition0b % bucketSize
(bucketIndex, byteIndex)
}
/**
* Determines whether the input stream has nBytes more bytes available
* starting at the current byte position. Read bytes are cached in buckets.
* Blocks until either nBytes are known to be available or EOF is reached.
* Does not advance the current byte position.
*/
def areBytesAvailable(nBytes: Long): Boolean = {
val finalBytePosition0b = curBytePosition0b + nBytes
if (finalBytePosition0b <= totalBytesBucketed) {
true
} else {
val (bucketIndex, byteIndex) = bytePositionToIndicies(finalBytePosition0b)
Assert.invariant(bucketIndex >= oldestBucketIndex)
val filled = fillBucketsToIndex(bucketIndex, byteIndex)
filled
}
}
/**
* Calculate how many bytes are currently buffered starting from the current
* position
*/
def bytesAvailable(): Long = {
var available = 0L
val (curBucketIndex, curByteIndex) = bytePositionToIndicies(curBytePosition0b)
var i = curBucketIndex
while (i < buckets.length) {
val startByteIndex = if (i == curBucketIndex) curByteIndex else 0
val endByteIndex = if (i == buckets.length - 1) bytesFilledInLastBucket else bucketSize
val bytesAvailableInBucket = endByteIndex - startByteIndex
available += bytesAvailableInBucket
i += 1
}
available
}
/**
* Return a single byte at the current byte position. Increments
* curBytePosition0b if successful. The Byte is return as an integer in the
* range 0 to 255. Returns -1 if EOF is reached.
*/
def get(): Int = {
val hasByte = areBytesAvailable(1)
if (!hasByte) {
-1
} else {
val (bucketIndex, byteIndex) = bytePositionToIndicies(curBytePosition0b)
if ((bucketIndex < 0) || (buckets(bucketIndex.toInt) == null))
throw new BacktrackingException(curBytePosition0b, maxCacheSizeInBytes)
val byte = buckets(bucketIndex.toInt).bytes(byteIndex.toInt)
curBytePosition0b += 1
byte & 0xFF
}
}
/**
* Return a byte array with data from the current byte position. Stores the
* next len bytes of data in dest starting at index off. Returns true if len
* bytes are available, false otherwise and writes nothing to dest.
*/
def get(dest: Array[Byte], off: Int, len: Int): Boolean = {
Assert.invariant(dest.length - off >= len)
val hasBytes = areBytesAvailable(len)
if (!hasBytes) {
false
} else {
var (bucketIndex, byteIndex) = bytePositionToIndicies(curBytePosition0b)
var bytesStillToGet = len
var destOffset = off
while (bytesStillToGet > 0) {
val bytesToGetFromCurrentBucket = Math.min(bucketSize - byteIndex, bytesStillToGet).toInt
if ((bucketIndex < 0) || (buckets(bucketIndex.toInt) == null))
throw new BacktrackingException(curBytePosition0b, maxCacheSizeInBytes)
Array.copy(buckets(bucketIndex.toInt).bytes, byteIndex.toInt, dest, destOffset, bytesToGetFromCurrentBucket)
destOffset += bytesToGetFromCurrentBucket
bytesStillToGet -= bytesToGetFromCurrentBucket
// next read will read from the next bucket
bucketIndex += 1
byteIndex = 0
}
curBytePosition0b += len
true
}
}
def position(): Long = curBytePosition0b
def position(bytePos0b: Long): Unit = {
val (bucketIndex, _) = bytePositionToIndicies(bytePos0b)
Assert.invariant(bucketIndex < buckets.length)
curBytePosition0b = bytePos0b
}
def lockPosition(bytePos0b: Long): Unit = {
val (bucketIndex, _) = bytePositionToIndicies(bytePos0b)
Assert.invariant(bucketIndex < buckets.length)
if (buckets(bucketIndex.toInt) != null)
buckets(bucketIndex.toInt).refCount += 1
}
def releasePosition(bytePos0b: Long): Unit = {
val (bucketIndex, _) = bytePositionToIndicies(bytePos0b)
if (buckets(bucketIndex.toInt) != null) {
Assert.invariant(bucketIndex >= oldestBucketIndex && bucketIndex < buckets.length)
buckets(bucketIndex.toInt).refCount -= 1
}
if (buckets(oldestBucketIndex.toInt).refCount == 0) {
// We just freed the last reference to the oldest bucket (or the oldest
// bucket happened to have no references). So try to release as many
// buckets as possible. Note that this might still not release anything
// if the oldest bucket contains the current byte position.
releaseBuckets()
}
}
private def releaseBuckets(): Unit = {
// we only want to release buckets when not debugging so that data is never
// cleaned up and is always available
if (!areDebugging) {
// Look for old buckets that are no longer referenced by a mark (i.e.
// refCount is zero), set them to null so they are garbage collected. Make
// sure not to remove whatever bucket holds the current byte position,
// even if there are no marks--we need to still read from that bucket.
val (curBucketIndex, _) = bytePositionToIndicies(curBytePosition0b)
while (oldestBucketIndex < curBucketIndex && buckets(oldestBucketIndex).refCount == 0) {
buckets(oldestBucketIndex) = null
oldestBucketIndex += 1
}
}
}
/**
* This should be called when at the end of a parse when all marks have been
* released, or just called periodically to ensure the size of the
* buckets arraybuffer does not grow too big. This will move all existing
* buckets to the front of the buckets ArrayBuffer and update offsets
* accordingly. This way, there are not a bunch of null empty buckets at the
* front of the buckets array taking up space.
*/
def compact(): Unit = {
releaseBuckets()
buckets.remove(0, oldestBucketIndex.toInt)
val bytesRemoved = oldestBucketIndex * bucketSize
headBucketBytePosition0b += bytesRemoved
oldestBucketIndex = 0
}
}
/**
* Wraps a java.nio.ByteBuffer in a InputSource
*
* When an instance of this class is created, it creates a readOnly copy of the
* ByteBuffer. The current position of the ByteBuffer is considered index 0.
* For example, if thed passed in ByteBuffer had position 2, calling
* setPosition(0) would reset the byteBuffer back to position 2. The limit of
* the ByteBuffer is considered the end of data.
*/
class ByteBufferInputSource(byteBuffer: ByteBuffer)
extends InputSource {
private val bb = byteBuffer.asReadOnlyBuffer
private val positionOffset = bb.position()
def areBytesAvailable(nBytes: Long): Boolean = {
bb.remaining >= nBytes
}
def bytesAvailable(): Long = {
bb.remaining
}
def get(): Int = {
if (!areBytesAvailable(1)) {
-1
} else {
bb.get() & 0xFF
}
}
def get(dest: Array[Byte], off: Int, len: Int): Boolean = {
Assert.invariant(dest.length - off >= len)
if (!areBytesAvailable(len)) {
false
} else {
bb.get(dest, off, len)
true
}
}
def position(): Long = bb.position() - positionOffset
def position(bytePos0b: Long): Unit = bb.position((bytePos0b + positionOffset).toInt)
def lockPosition(bytePos0b: Long): Unit = {} // noop
def releasePosition(bytePos0b: Long): Unit = {} // noop
def compact(): Unit = { // noop
// Note that ByteBuffer.compact() does not do the same thing as this
// method. The point of this compact() method is to free up storage of
// bytes that no longer need to be used when we know we will never need to
// backtrack to them. However, the ByteBuffer.compact() method doesn't free
// allocated space, it just copies unread bytes to the beginning of the
// ByteBuffer so that more space is available for calls to put(). To meet
// the intention of compact() to free up space, we would need to allocate a
// smaller ByteBuffer and copy all unread bytes into that, and let the old one
// be garbage collected. For a large ByteBuffer with lots of unread data,
// that would be expensive. Furthermore, the original ByteBuffer would
// mostly likely still exist in memory created by the original caller, so
// this would just be using up more memory. So let's just do nothing here.
// Similarly, lock/releasePosition methods do not need to do anything,
// since those are only used to prevent memory segments from being
// compacted.
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy