com.fasterxml.aalto.util.TextBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aalto-xml Show documentation
Show all versions of aalto-xml Show documentation
Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2)
package com.fasterxml.aalto.util;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;
import org.codehaus.stax2.typed.Base64Variant;
import org.codehaus.stax2.typed.TypedArrayDecoder;
import org.codehaus.stax2.typed.TypedXMLStreamException;
import org.codehaus.stax2.ri.typed.CharArrayBase64Decoder;
import com.fasterxml.aalto.in.ReaderConfig;
/**
* Class conceptually similar to {@link java.lang.StringBuilder}, but
* that allows for bit more efficient building, using segmented internal
* buffers, and direct access to these buffers.
*/
public final class TextBuilder
{
final static char[] sNoChars = new char[0];
/**
* Size of the first text segment buffer to allocate. Need not contain
* the biggest segment, since new ones will get allocated as needed.
* However, it's sensible to use something that often is big enough
* to contain typical segments.
*/
final static int DEF_INITIAL_BUFFER_SIZE = 500; // 1k
final static int MAX_SEGMENT_LENGTH = 256 * 1024;
final static int INT_SPACE = 0x0020;
// // // Configuration:
private final ReaderConfig _config;
// // // Internal non-shared collector buffers:
/**
* List of segments prior to currently active segment.
*/
private ArrayList _segments;
// // // Currently used segment; not (yet) contained in _segments
/**
* Amount of characters in segments in {@link _segments}
*/
private int _segmentSize;
private char[] _currentSegment;
/**
* Number of characters in currently active (last) segment
*/
private int _currentSize;
// // // Temporary caching for Objects to return
/**
* String that will be constructed when the whole contents are
* needed; will be temporarily stored in case asked for again.
*/
private String _resultString;
private char[] _resultArray;
/**
* Indicator for length of data with _resultArray
, iff
* the primary indicator (_currentSize) is invalid (-1).
*/
private int _resultLen;
/*
/**********************************************************************
/* Support for decoding, for Typed Access API
/**********************************************************************
*/
private char[] _decodeBuffer;
private int _decodePtr;
private int _decodeEnd;
/*
/**********************************************************************
/* Support for optimizating indentation segments:
/**********************************************************************
*/
/**
* Marker to know if the contents currently stored were created
* using "indentation detection". If so, it's known to be all
* white space
*/
private boolean _isIndentation = false;
// // // Canonical indentation objects (up to 32 spaces, 8 tabs)
public final static int MAX_INDENT_SPACES = 32;
public final static int MAX_INDENT_TABS = 8;
// Let's add one more space at the end, for safety...
private final static String sIndSpaces =
// 123456789012345678901234567890123
"\n ";
private final static char[] sIndSpacesArray = sIndSpaces.toCharArray();
private final static String[] sIndSpacesStrings = new String[sIndSpacesArray.length];
private final static String sIndTabs =
// 1 2 3 4 5 6 7 8 9
"\n\t\t\t\t\t\t\t\t\t";
private final static char[] sIndTabsArray = sIndTabs.toCharArray();
private final static String[] sIndTabsStrings = new String[sIndTabsArray.length];
/*
/**********************************************************************
/* Life-cycle
/**********************************************************************
*/
private TextBuilder(ReaderConfig cfg)
{
_config = cfg;
}
public static TextBuilder createRecyclableBuffer(ReaderConfig cfg)
{
return new TextBuilder(cfg);
}
/**
* Method called to indicate that the underlying buffers should now
* be recycled if they haven't yet been recycled. Although caller
* can still use this text buffer, it is not advisable to call this
* method if that is likely, since next time a buffer is needed,
* buffers need to reallocated.
* Note: calling this method automatically also clears contents
* of the buffer.
*/
public void recycle(boolean force)
{
if (_config != null && _currentSegment != null) {
if (force) {
/* shouldn't call resetWithEmpty, as that would allocate
* initial buffer; but need to inline
*/
_resultString = null;
_resultArray = null;
} else {
/* But if there's non-shared data (ie. buffer is still
* in use), can't return it yet:
*/
if ((_segmentSize + _currentSize) > 0) {
return;
}
}
// If no data (or only shared data), can continue
if (_segments != null && _segments.size() > 0) {
// No need to use anything from list, curr segment not null
_segments.clear();
_segmentSize = 0;
}
char[] buf = _currentSegment;
_currentSegment = null;
_config.freeMediumCBuffer(buf);
}
}
/**
* Method called to clear out any content text buffer may have, and
* initializes and returns the first segment to add characters to.
*/
public char[] resetWithEmpty()
{
_resultString = null;
_resultArray = null;
_isIndentation = false;
// And then reset internal input buffers, if necessary:
if (_segments != null && _segments.size() > 0) {
/* Since the current segment should be the biggest one
* (as we allocate 50% bigger each time), let's retain it,
* and clear others
*/
_segments.clear();
_segmentSize = 0;
}
_currentSize = 0;
if (_currentSegment == null) {
_currentSegment = allocBuffer(0);
}
return _currentSegment;
}
public void resetWithIndentation(int indCharCount, char indChar)
{
// First reset internal input buffers, if necessary:
if (_segments != null && _segments.size() > 0) {
_segments.clear();
_segmentSize = 0;
}
_currentSize = -1;
_isIndentation = true;
String text;
int strlen = indCharCount+1;
_resultLen = strlen;
if (indChar == '\t') { // tabs?
_resultArray = sIndTabsArray;
text = sIndTabsStrings[indCharCount];
if (text == null) {
sIndTabsStrings[indCharCount] = text = sIndTabs.substring(0, strlen);
}
} else { // nope, spaces (should assert indChar?)
_resultArray = sIndSpacesArray;
text = sIndSpacesStrings[indCharCount];
if (text == null) {
sIndSpacesStrings[indCharCount] = text = sIndSpaces.substring(0, strlen);
}
}
_resultString = text;
}
/**
* Method called to initialize the buffer with just a single char
*/
public void resetWithChar(char c)
{
_resultString = null;
_resultArray = null;
_isIndentation = false;
// And then reset internal input buffers, if necessary:
if (_segments != null && _segments.size() > 0) {
_segments.clear();
_segmentSize = 0;
}
_currentSize = 1;
if (_currentSegment == null) {
_currentSegment = allocBuffer(1);
}
_currentSegment[0] = c;
}
public void resetWithSurrogate(int c)
{
_resultString = null;
_resultArray = null;
_isIndentation = false;
// And then reset internal input buffers, if necessary:
if (_segments != null && _segments.size() > 0) {
_segments.clear();
_segmentSize = 0;
}
_currentSize = 2;
if (_currentSegment == null) {
_currentSegment = allocBuffer(2);
}
_currentSegment[0] = (char) (0xD800 | (c >> 10));
_currentSegment[1] = (char) (0xDC00 | (c & 0x3FF));
}
public char[] getBufferWithoutReset()
{
return _currentSegment;
}
/*
/**********************************************************************
/* Accessors for implementing StAX interface:
/**********************************************************************
*/
/**
* @return Number of characters currently stored by this collector
*/
public int size()
{
int size = _currentSize;
// Will be -1 only if we have shared white space
if (size < 0) {
return _resultLen;
}
return size + _segmentSize;
}
public char[] getTextBuffer()
{
// Does it fit in just one segment?
if (_segments == null || _segments.size() == 0) {
// But is it whitespace, actually?
if (_resultArray != null) {
return _resultArray;
}
return _currentSegment;
}
// Nope, need to have/create a non-segmented array and return it
return contentsAsArray();
}
/*
/**********************************************************************
/* Accessors for text contained
/**********************************************************************
*/
public String contentsAsString()
{
if (_resultString == null) {
// Has array been requested? Can make a shortcut, if so:
if (_resultArray != null) {
_resultString = new String(_resultArray);
} else {
// Let's optimize common case: nothing in extra segments:
int segLen = _segmentSize;
int currLen = _currentSize;
if (segLen == 0) {
_resultString = (currLen == 0) ? "" : new String(_currentSegment, 0, currLen);
return _resultString;
}
// Nope, need to combine:
StringBuilder sb = new StringBuilder(segLen + currLen);
// First stored segments
if (_segments != null) {
for (int i = 0, len = _segments.size(); i < len; ++i) {
char[] curr = (char[]) _segments.get(i);
sb.append(curr, 0, curr.length);
}
}
// And finally, current segment:
sb.append(_currentSegment, 0, currLen);
_resultString = sb.toString();
}
}
return _resultString;
}
public char[] contentsAsArray()
{
char[] result = _resultArray;
if (result == null) {
_resultArray = result = buildResultArray();
}
return result;
}
public int contentsToArray(int srcStart, char[] dst, int dstStart, int len) {
/* Could also check if we have array, but that'd only help with
* brain dead clients that get full array first, then segments...
* which hopefully aren't that common
*/
// Copying from segmented array is bit more involved:
int totalAmount = 0;
if (_segments != null) {
for (int i = 0, segc = _segments.size(); i < segc; ++i) {
char[] segment = (char[]) _segments.get(i);
int segLen = segment.length;
int amount = segLen - srcStart;
if (amount < 1) { // nothing from this segment?
srcStart -= segLen;
continue;
}
if (amount >= len) { // can get rest from this segment?
System.arraycopy(segment, srcStart, dst, dstStart, len);
return (totalAmount + len);
}
// Can get some from this segment, offset becomes zero:
System.arraycopy(segment, srcStart, dst, dstStart, amount);
totalAmount += amount;
dstStart += amount;
len -= amount;
srcStart = 0;
}
}
// Need to copy anything from last segment?
if (len > 0) {
int maxAmount = _currentSize - srcStart;
if (len > maxAmount) {
len = maxAmount;
}
if (len > 0) { // should always be true
System.arraycopy(_currentSegment, srcStart, dst, dstStart, len);
totalAmount += len;
}
}
return totalAmount;
}
/**
* Method that will stream contents of this buffer into specified
* Writer.
*/
public int rawContentsTo(Writer w)
throws IOException
{
// Let's first see if we have created helper objects:
if (_resultArray != null) {
w.write(_resultArray);
return _resultArray.length;
}
if (_resultString != null) {
w.write(_resultString);
return _resultString.length();
}
// Nope, need to do full segmented output
int rlen = 0;
if (_segments != null) {
for (int i = 0, len = _segments.size(); i < len; ++i) {
char[] ch = (char[]) _segments.get(i);
w.write(ch);
rlen += ch.length;
}
}
if (_currentSize > 0) {
w.write(_currentSegment, 0, _currentSize);
rlen += _currentSize;
}
return rlen;
}
public boolean isAllWhitespace()
{
if (_isIndentation) {
return true;
}
// Need to do full segmented output, otherwise
if (_segments != null) {
for (int i = 0, len = _segments.size(); i < len; ++i) {
char[] buf = (char[]) _segments.get(i);
for (int j = 0, len2 = buf.length; j < len2; ++j) {
if (buf[j] > 0x0020) {
return false;
}
}
}
}
char[] buf = _currentSegment;
for (int i = 0, len = _currentSize; i < len; ++i) {
if (buf[i] > 0x0020) {
return false;
}
}
return true;
}
/**
* Method that can be used to check if the contents of the buffer end
* in specified String.
*
* @return True if the textual content buffer contains ends with the
* specified String; false otherwise
*/
public boolean endsWith(String str)
{
int segIndex = (_segments == null) ? 0 : _segments.size();
int inIndex = str.length() - 1;
char[] buf = _currentSegment;
int bufIndex = _currentSize-1;
while (inIndex >= 0) {
if (str.charAt(inIndex) != buf[bufIndex]) {
return false;
}
if (--inIndex == 0) {
break;
}
if (--bufIndex < 0) {
if (--segIndex < 0) { // no more data?
return false;
}
buf = (char[]) _segments.get(segIndex);
bufIndex = buf.length-1;
}
}
return true;
}
/**
* Note: it is assumed that this method is not used often enough to
* be a bottleneck, or for long segments. Based on this, it is optimized
* for common simple cases where there is only one single character
* segment to use; fallback for other cases is to create such segment.
*/
public boolean equalsString(String str)
{
int expLen = str.length();
// Otherwise, segments:
if (expLen != size()) {
return false;
}
char[] seg;
if (_segments == null || _segments.size() == 0) {
// just one segment, still easy
seg = _currentSegment;
} else {
/* Ok; this is the sub-optimal case. Could obviously juggle through
* segments, but probably not worth the hassle, we seldom if ever
* get here...
*/
seg = contentsAsArray();
}
for (int i = 0; i < expLen; ++i) {
if (seg[i] != str.charAt(i)) {
return false;
}
}
return true;
}
/*
/**********************************************************************
/* Methods for generating SAX events
/**********************************************************************
*/
/**
* This is a specialized "accessor" method, which is basically
* to fire SAX characters() events in an optimal way, based on
* which internal buffers are being used
*/
public void fireSaxCharacterEvents(ContentHandler h)
throws SAXException
{
if (_resultArray != null) { // only happens for indentation
h.characters(_resultArray, 0, _resultLen);
} else {
if (_segments != null) {
for (int i = 0, len = _segments.size(); i < len; ++i) {
char[] ch = (char[]) _segments.get(i);
h.characters(ch, 0, ch.length);
}
}
if (_currentSize > 0) {
h.characters(_currentSegment, 0, _currentSize);
}
}
}
public void fireSaxSpaceEvents(ContentHandler h)
throws SAXException
{
if (_resultArray != null) { // only happens for indentation
h.ignorableWhitespace(_resultArray, 0, _resultLen);
} else {
if (_segments != null) {
for (int i = 0, len = _segments.size(); i < len; ++i) {
char[] ch = (char[]) _segments.get(i);
h.ignorableWhitespace(ch, 0, ch.length);
}
}
if (_currentSize > 0) {
h.ignorableWhitespace(_currentSegment, 0, _currentSize);
}
}
}
public void fireSaxCommentEvent(LexicalHandler h)
throws SAXException
{
// Comment can not be split, so may need to combine the array
if (_resultArray != null) { // only happens for indentation
h.comment(_resultArray, 0, _resultLen);
} else if (_segments != null && _segments.size() > 0) {
char[] ch = contentsAsArray();
h.comment(ch, 0, ch.length);
} else {
h.comment(_currentSegment, 0, _currentSize);
}
}
/*
/**********************************************************************
/* Support for validation
/**********************************************************************
*/
/*
public void validateText(XMLValidator vld, boolean lastSegment)
throws XMLValidationException
{
// Can either create a combine buffer, or construct
// a String. While former could be more efficient, let's do latter
// for now since current validator implementations work better
// with Strings.
vld.validateText(contentsAsString(), lastSegment);
}
*/
/*
/**********************************************************************
/* Public mutators:
/**********************************************************************
*/
public void append(char c)
{
_resultString = null;
_resultArray = null;
// Room in current segment?
char[] curr = _currentSegment;
if (_currentSize >= curr.length) {
expand(1);
}
curr[_currentSize++] = c;
}
public void appendSurrogate(int surr)
{
append((char) (0xD800 | (surr >> 10)));
append((char) (0xDC00 | (surr & 0x3FF)));
}
public void append(char[] c, int start, int len)
{
_resultString = null;
_resultArray = null;
// Room in current segment?
char[] curr = _currentSegment;
int max = curr.length - _currentSize;
if (max >= len) {
System.arraycopy(c, start, curr, _currentSize, len);
_currentSize += len;
} else {
// No room for all, need to copy part(s):
if (max > 0) {
System.arraycopy(c, start, curr, _currentSize, max);
start += max;
len -= max;
}
/* And then allocate new segment; we are guaranteed to now
* have enough room in segment.
*/
expand(len); // note: curr != _currentSegment after this
System.arraycopy(c, start, _currentSegment, 0, len);
_currentSize = len;
}
}
public void append(String str)
{
_resultString = null;
_resultArray = null;
int len = str.length();
// Room in current segment?
char[] curr = _currentSegment;
int max = curr.length - _currentSize;
if (max >= len) {
str.getChars(0, len, curr, _currentSize);
_currentSize += len;
} else {
// No room for all, need to copy part(s):
if (max > 0) {
str.getChars(0, max, curr, _currentSize);
len -= max;
}
/* And then allocate new segment; we are guaranteed to now
* have enough room in segment.
*/
expand(len);
str.getChars(max, max+len, _currentSegment, 0);
_currentSize = len;
}
}
/*
/**********************************************************************
/* Raw access, for high-performance use:
/**********************************************************************
*/
public int getCurrentLength() {
return _currentSize;
}
public void setCurrentLength(int len) {
_currentSize = len;
}
public char[] finishCurrentSegment()
{
if (_segments == null) {
_segments = new ArrayList();
}
_segments.add(_currentSegment);
int oldLen = _currentSegment.length;
_segmentSize += oldLen;
char[] curr = new char[calcNewSize(oldLen)];
_currentSize = 0;
_currentSegment = curr;
return curr;
}
private int calcNewSize(int latestSize)
{
// Let's grow segments by 50%, when over 8k
int incr = (latestSize < 8000) ? latestSize : (latestSize >> 1);
int size = latestSize + incr;
// but let's not create too big chunks
return Math.min(size, MAX_SEGMENT_LENGTH);
}
/*
/**********************************************************************
/* Methods for implementing Typed Access API
/**********************************************************************
*/
/**
* Method called by the stream reader to decode space-separated tokens
* that are part of the current text event (contents of which
* are stored within this buffer), using given decoder.
*/
public int decodeElements(TypedArrayDecoder tad, boolean reset)
throws TypedXMLStreamException
{
if (reset) {
resetForDecode();
}
int ptr = _decodePtr;
final char[] buf = _decodeBuffer;
int count = 0;
// And then let's decode
int start = ptr;
try {
final int end = _decodeEnd;
decode_loop:
while (ptr < end) {
// First, any space to skip?
while (buf[ptr] <= INT_SPACE) {
if (++ptr >= end) {
break decode_loop;
}
}
// Then let's figure out non-space char (token)
start = ptr;
++ptr;
while (ptr < end && buf[ptr] > INT_SPACE) {
++ptr;
}
++count;
int tokenEnd = ptr;
++ptr; // to skip trailing space (or, beyond end)
// And there we have it
if (tad.decodeValue(buf, start, tokenEnd)) {
break;
}
_decodePtr = ptr;
}
_decodePtr = ptr;
} catch (IllegalArgumentException iae) {
// Need to convert to a checked stream exception to return lexical
// -1 to move it back after being advanced earlier (to skip trailing space)
String lexical = new String(buf, start, (ptr-start-1));
throw new TypedXMLStreamException(lexical, iae.getMessage(), iae);
}
return count;
}
/**
* Method called to initialize given base64 decoder with data
* contained in this text buffer (for the current event).
*/
public void resetForBinaryDecode(Base64Variant v, CharArrayBase64Decoder dec, boolean firstChunk)
{
// just one special case, indentation...
if (_segments == null || _segments.size() == 0) { // single segment
if (_isIndentation) { // but special one, indent/ws
dec.init(v, firstChunk, _resultArray, 0, _resultArray.length, null);
return;
}
}
dec.init(v, firstChunk, _currentSegment, 0, _currentSize, _segments);
}
private final void resetForDecode()
{
/* This is very similar to getTextBuffer(), except
* for assignment to _decodeXxx fields
*/
_decodePtr = 0;
if (_segments == null || _segments.size() == 0) { // single segment
if (_isIndentation) { // but special one, indent/ws
_decodeBuffer = _resultArray;
_decodeEnd = _resultArray.length;
} else { // nope, just a regular buffer
_decodeBuffer = _currentSegment;
_decodeEnd = _currentSize;
}
} else {
// Nope, need to have/create a non-segmented array and return it
_decodeBuffer = contentsAsArray();
_decodeEnd = _decodeBuffer.length;
}
}
/*
/**********************************************************************
/* Standard methods:
/**********************************************************************
*/
/**
* Note: calling this method may not be as efficient as calling
* {@link #contentsAsString}, since it is guaranteed that resulting
* String is NOT cached (to ensure we see no stale data)
*/
@Override
public String toString() {
_resultString = null;
_resultArray = null;
return contentsAsString();
}
/*
/**********************************************************************
/* Internal methods:
/**********************************************************************
*/
private final char[] allocBuffer(int minNeeded)
{
int size = Math.max(DEF_INITIAL_BUFFER_SIZE, minNeeded);
char[] buf = null;
if (_config != null) {
buf = _config.allocMediumCBuffer(size);
if (buf != null) {
return buf;
}
}
return new char[size];
}
/**
* Method called when current segment is full, to allocate new
* segment.
*/
private void expand(int roomNeeded)
{
// First, let's move current segment to segment list:
if (_segments == null) {
_segments = new ArrayList();
}
char[] curr = _currentSegment;
_segments.add(curr);
int oldLen = curr.length;
_segmentSize += oldLen;
int newSize = Math.max(roomNeeded, calcNewSize(oldLen));
curr = new char[newSize];
_currentSize = 0;
_currentSegment = curr;
}
private char[] buildResultArray()
{
if (_resultString != null) { // Can take a shortcut...
return _resultString.toCharArray();
}
char[] result;
int size = size();
if (size < 1) {
return sNoChars;
}
int offset = 0;
result = new char[size];
if (_segments != null) {
for (int i = 0, len = _segments.size(); i < len; ++i) {
char[] curr = _segments.get(i);
int currLen = curr.length;
System.arraycopy(curr, 0, result, offset, currLen);
offset += currLen;
}
}
System.arraycopy(_currentSegment, 0, result, offset, _currentSize);
return result;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy