/* Aalto XML processor
*
* Copyright (c) 2006- Tatu Saloranta, [email protected]
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.in;
import java.text.MessageFormat;
import javax.xml.namespace.QName;
import javax.xml.stream.Location;
import javax.xml.stream.XMLStreamException;
import org.codehaus.stax2.typed.Base64Variant;
import org.codehaus.stax2.typed.TypedArrayDecoder;
import org.codehaus.stax2.typed.TypedValueDecoder;
import org.codehaus.stax2.typed.TypedXMLStreamException;
import org.codehaus.stax2.ri.typed.CharArrayBase64Decoder;
import org.codehaus.stax2.ri.typed.ValueDecoderFactory;
import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.util.DataUtil;
/**
* Object used by the tokenizer to collect and store information
 * about attributes; specifically, their names and values.
*
*/
public final class AttributeCollector
{
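/* Rough usage sketch, for illustration only: the real callers are the
 * XmlScanner implementations, and the offsets and variable names below
 * ("attrCollector", "endOffset") are hypothetical.
 *
 *   char[] buf = attrCollector.startNewValue(attrName, 0);
 *   // ... scanner copies value characters into buf, calling
 *   // valueBufferFull() if it runs out of room and continueValue()
 *   // to re-fetch the (possibly grown) buffer ...
 *   int count = attrCollector.finishLastValue(endOffset);
 *   if (count < 0) { // duplicate attribute detected
 *       // report attrCollector.getErrorMsg() via the scanner
 *   }
 *   String id = attrCollector.getValue("", "id");
 */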
private final static int INT_SPACE = 0x0020;
/**
* Let's guess that most of the time there won't be more than
* 12 attributes. Since the underlying buffer will be expanded
* as necessary, exact value is chosen to minimize overhead
* rather than eliminate any resizing.
*/
private final static int DEFAULT_ENTRY_COUNT = 12;
/**
* The default length of the value buffer is also chosen more
* to minimize overhead than to eliminate all need for resizing.
*/
private final static int DEFAULT_BUFFER_LENGTH = 120;
// // // Configuration
final ReaderConfig _config;
// // // State: actual collected attributes
/**
* Number of attributes currently held by this collector.
*/
private int _attrCount;
private PName[] _names = null;
/**
 * Consecutive character array in which attribute values are
 * concatenated.
*/
private char[] _valueBuffer = null;
// // // State: hash table (-like structure) for attributes
/**
* Int-based compact data structure that contains mapping from
* attribute names to attribute indexes in the main attribute name array.
*
 * Data structure contains two separate areas; main hash area (with
 * size <code>_hashAreaSize</code>), and remaining spillover area
 * that follows the hash area up until (but not including) the
 * <code>_spillAreaEnd</code> index.
* Main hash area only contains indexes (index+1; 0 signifying empty slot)
* to actual attributes; spillover area has both hash and index for
* any spilled entry. Spilled entries are simply stored in order
* added, and need to be searched using linear search. In case of both
* primary hash hits and spills, eventual comparison with the local
* name needs to be done with actual name array.
*/
protected int[] _attrMap = null;
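/* Illustrative layout (hypothetical contents): with, say, 5 attributes
 * the hash area would be sized to 8 ints. A slot value of 0 marks an
 * empty slot; a value of (i+1) points to attribute i in _names. Colliding
 * entries are appended as [hash, index] int pairs in the spill area that
 * starts at offset _hashAreaSize; here attribute 1 has spilled:
 *
 *   offset:   0  1  2  3  4  5  6  7 | 8      9   ...
 *   content:  3  0  1  0  5  0  0  4 | hash1  1   ...
 */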
/**
 * Size of hash area in <code>_attrMap</code>; generally at least 20%
 * more than number of attributes (<code>_attrCount</code>).
*/
protected int _hashAreaSize;
/**
 * Pointer to int slot right after last spill entry, in the
 * <code>_attrMap</code> array.
*/
protected int _spillAreaEnd;
// // // State: work-in-progress:
/**
 * Array that contains ending offsets of the values in the shared
 * buffer. Each entry contains the character offset right after the end
 * of the matching value; so entry 0, for example, also gives the
 * starting offset of entry 1.
*/
private int[] _valueOffsets = null;
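/* Worked example (hypothetical values): for two attributes with values
 * "ab" and "cde", the scanner copies the characters into _valueBuffer as
 * "abcde"; startNewValue() for the second attribute records offset 2 into
 * _valueOffsets[0], and finishLastValue(5) records 5 into _valueOffsets[1].
 * getValue(0) then returns characters [0,2) ("ab") and getValue(1)
 * characters [2,5) ("cde").
 */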
/**
 * Flag used to indicate that all attribute values for an element
 * have been parsed, and that the next call to {@link #startNewValue}
 * should reset the value structures.
*/
private boolean _needToResetValues = true;
/**
* For some errors, we'll have to temporarily store error message,
* to be thrown at a later point.
*/
private String _errorMsg = null;
// // // Temporary storage for optimizations
/**
* Concatenated String that contains all the attribute values
* for the element. Allows some buffer reuse, and should result
* in slight speed optimization, for elements with lots of
* attributes that are usually all (or none) accessed.
*/
private String _allAttrValues = null;
/*
/**********************************************************************
/* Life-cycle methods (creation, further construction)
/**********************************************************************
*/
protected AttributeCollector(ReaderConfig cfg)
{
_config = cfg;
_attrCount = 0;
}
/**
* Method called by the parser right after attribute name has been
* parsed, but before value has been parsed.
*
* @return Underlying character buffer to use for storing attribute
* value characters
*/
public char[] startNewValue(PName attrName, int currOffset)
{
int count;
if (_needToResetValues) {
_needToResetValues = false;
_attrCount = count = 0;
_allAttrValues = null;
if (_valueBuffer == null) { // first time for this instance
_names = new PName[DEFAULT_ENTRY_COUNT];
_valueBuffer = new char[DEFAULT_BUFFER_LENGTH];
_valueOffsets = new int[DEFAULT_ENTRY_COUNT];
}
} else {
// Not enough room for a new entry?
count = _attrCount;
if (count >= _valueOffsets.length) {
int[] oldVal = _valueOffsets;
PName[] oldNames = _names;
int oldLen = oldVal.length;
int newLen = oldLen + oldLen;
_valueOffsets = new int[newLen];
_names = new PName[newLen];
for (int i = 0; i < oldLen; ++i) {
_valueOffsets[i] = oldVal[i];
_names[i] = oldNames[i];
}
}
if (count > 0) { // no predecessor for the first entry
_valueOffsets[count-1] = currOffset;
}
}
_names[count] = attrName;
++_attrCount;
return _valueBuffer;
}
public char[] continueValue()
{
return _valueBuffer;
}
/**
* Method called after all attribute entries have been parsed,
* and thus the end of the last value in the buffer is known.
*
* @return Number of attributes collected
*/
public final int finishLastValue(int endingOffset)
{
// Did we get any values?
if (_needToResetValues) { // nope
return 0;
}
_needToResetValues = true; // so it'll get reset next time a value is started
// Since a previous startNewValue checked buffers, no check needed
int count = _attrCount;
_valueOffsets[count-1] = endingOffset;
/* So far so good. But now, also need to ensure there are no
* duplicates. This also allows us to create a hash for efficient
* access by name as a side effect. Since hash table building
* overhead is somewhat significant, let's only use it for 3 or
* more attributes.
*/
if (count < 3) {
_hashAreaSize = 0;
if (count == 2) {
PName[] names = _names;
if (names[0].boundEquals(names[1])) {
noteDupAttr(0, 1);
return -1;
}
}
return count;
}
return finishLastValue2();
}
public final int finishLastValue2()
{
int count = _attrCount;
PName[] names = _names;
// Ok, nope, better use a hash:
/* Ok, finally, let's create attribute map, to allow efficient
* access by prefix+localname combination. Could do it on-demand,
* but this way we can check for duplicates right away.
*/
int[] map = _attrMap;
/* What's minimum size to contain at most 80% full hash area,
* plus 1/8 spill area (12.5% spilled entries, two ints each)?
* Since we'll need 8 for 4 entries and up, and minimum to get
* here is 3 entries, let's just skip 4 entry map...
*/
int hashCount = 8;
{
int min = count + (count >> 2); // == 80% fill rate
/* Need to get 2^N size that can contain all elements, with
* 80% fill rate
*/
while (hashCount < min) {
hashCount += hashCount; // 2x
}
// And then add the spill area
_hashAreaSize = hashCount;
min = hashCount + (hashCount >> 4); // 12.5 x 2 ints
if (map == null || map.length < min) {
map = new int[min];
} else {
/* Need to clear old hash entries (if any). But note that
* spilled entries we can leave alone -- they are just ints,
* and get overwritten if and as needed
*/
map[0] = map[1] = map[2] = map[3] = map[4] = map[5] = map[6] = map[7] = 0;
for (int i = 8; i < hashCount; ++i) {
map[i] = 0;
}
}
}
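/* Worked sizing example (illustration only): with 13 attributes,
 * min = 13 + (13 >> 2) = 16, so hashCount grows from 8 to 16; the map is
 * then allocated (or reused) with at least 16 + (16 >> 4) = 17 ints,
 * leaving a little initial spill room. More spill room is added on demand
 * below via DataUtil.growArrayBy().
 */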
{
int mask = hashCount-1;
int spillIndex = hashCount;
// Ok, array's fine, let's hash 'em in!
for (int i = 0; i < count; ++i) {
PName newName = names[i];
int hash = newName.boundHashCode();
int index = hash & mask;
// Hash slot available?
int oldNameIndex = map[index];
if (oldNameIndex == 0) { // yup
map[index] = i+1; // since 0 is marker
} else { // nope, collision, need to spill
--oldNameIndex; // to unmask 0 etc
// But first, is it a dup?
if (names[oldNameIndex].boundEquals(newName)) {
// Only first collision needs to be reported
if (_errorMsg == null) {
noteDupAttr(oldNameIndex, i);
}
/* let's still continue to build hash, even if there's
* collision; to keep data as consistent (and accessible)
* as possible
*/
}
/* Is there room to spill into? (need 2 int slots;
 * one for the hash, the other for the index)
 */
if ((spillIndex + 1) >= map.length) {
// Let's just add room for 4 more spills (2 ints each)...
map = DataUtil.growArrayBy(map, 8);
}
// Let's first ensure we aren't adding a dup:
for (int j = hashCount; j < spillIndex; j += 2) {
if (map[j] == hash) {
oldNameIndex = map[j+1];
if (names[oldNameIndex].boundEquals(newName)) {
if (_errorMsg == null) {
noteDupAttr(oldNameIndex, i);
}
break;
}
}
}
map[spillIndex++] = hash;
map[spillIndex++] = i; // no need to mask 0
}
}
_spillAreaEnd = spillIndex;
}
_attrMap = map;
return (_errorMsg == null) ? count : -1;
}
/**
 * Method called by the owner when the value buffer has been filled up,
 * and more space is needed for the value being parsed.
 */
public char[] valueBufferFull()
{
/* Let's just double the size as necessary? Could also grow
* by less (50%?)... but shouldn't greatly matter
*/
_valueBuffer = DataUtil.growArrayBy(_valueBuffer, _valueBuffer.length);
return _valueBuffer;
}
/*
/**********************************************************************
/* Accessors
/**********************************************************************
*/
public final int getCount() { return _attrCount; }
public final PName getName(int index) { return _names[index]; }
public final QName getQName(int index)
{
return _names[index].constructQName();
}
public String getValue(int index)
{
int count = _attrCount;
/* Note: no checks, caller is to ensure index is ok. Acceptable
* since it's not externally exposed
*/
if (_allAttrValues == null) {
int len = _valueOffsets[count-1];
_allAttrValues = (len == 0) ? "" : new String(_valueBuffer, 0, len);
}
if (index == 0) {
if (count == 1) { // Degenerate case; only one substring?
return _allAttrValues;
}
int len = _valueOffsets[0];
return (len == 0) ? "" : _allAttrValues.substring(0, len);
}
/* !!! 11-Nov-2006, tatus: Should we cache constructed value?
* Might be worth the trouble
*/
int start = _valueOffsets[index-1];
int end = _valueOffsets[index];
return (start == end) ? "" : _allAttrValues.substring(start, end);
}
public String getValue(String nsUri, String localName)
{
int ix = findIndex(nsUri, localName);
return (ix >= 0) ? getValue(ix) : null;
}
public int findIndex(String nsUri, String localName)
{
int hashSize = _hashAreaSize;
// No hash? Linear search, then:
if (hashSize < 1) {
for (int i = 0, len = _attrCount; i < len; ++i) {
PName curr = _names[i];
if (curr.boundEquals(nsUri, localName)) {
return i;
}
}
return -1;
}
// Need to/can use hash... primary hit?
int hash = PName.boundHashCode(nsUri, localName);
int ix = _attrMap[hash & (hashSize-1)];
if (ix > 0) { // has primary entry, does it match?
--ix;
// Is primary candidate match?
if (_names[ix].boundEquals(nsUri, localName)) {
return ix;
}
/* Nope, need to traverse spill list, which has 2 entries for
* each spilled attribute id; first for hash value, second index.
*/
for (int i = hashSize, len = _spillAreaEnd; i < len; i += 2) {
if (_attrMap[i] != hash) {
continue;
}
/* Note: spill indexes are not off-by-one, since there's
* no need to mask 0
*/
ix = _attrMap[i+1];
if (_names[ix].boundEquals(nsUri, localName)) {
return ix;
}
}
}
return -1;
}
public String getErrorMsg() { return _errorMsg; }
/*
/**********************************************************************
/* Type-safe accessors to support TypedXMLStreamReader
/**********************************************************************
*/
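/**
 * Method that trims leading and trailing white space (characters with
 * code points at or below 0x0020) off the specified attribute value and
 * passes the result to the given decoder; an all-white-space value is
 * signaled via handleEmptyValue() instead.
 */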
public final void decodeValue(int index, TypedValueDecoder dec)
throws IllegalArgumentException
{
if (index < 0 || index >= _attrCount) {
throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes");
}
// No cached String values, better just pass char array ref
int start, end;
if (index == 0) {
start = 0;
end = _valueOffsets[0];
} else {
start = _valueOffsets[index-1];
end = _valueOffsets[index];
}
// Nonetheless, must trim before passing the value
final char[] buf = _valueBuffer;
while (true) {
if (start >= end) {
dec.handleEmptyValue();
return;
}
if (!isSpace(buf[start])) {
break;
}
++start;
}
// Trailing space?
while (--end > start && isSpace(buf[end])) { }
dec.decode(buf, start, end+1);
}
public final int decodeValues(int index, TypedArrayDecoder dec, XmlScanner scanner)
throws XMLStreamException
{
if (index < 0 || index >= _attrCount) {
throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes");
}
int start, end;
if (index == 0) {
start = 0;
end = _valueOffsets[0];
} else {
start = _valueOffsets[index-1];
end = _valueOffsets[index];
}
return decodeValues(dec, _valueBuffer, start, end, scanner);
}
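/**
 * Helper method that tokenizes the given value on white space (characters
 * with code points at or below 0x0020) and feeds each token to the array
 * decoder, expanding the decoder's buffer when possible. For example, a
 * value of " 1 2  3 " would typically result in three decodeValue() calls,
 * for "1", "2" and "3". Decoding errors are converted into (checked)
 * TypedXMLStreamException instances.
 */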
private final int decodeValues(TypedArrayDecoder dec,
final char[] buf, int ptr, final int end,
final XmlScanner scanner)
throws XMLStreamException
{
int start = ptr;
int count = 0;
try {
decode_loop:
while (ptr < end) {
// First, any space to skip?
while (buf[ptr] <= INT_SPACE) {
if (++ptr >= end) {
break decode_loop;
}
}
// Then let's figure out non-space char (token)
start = ptr;
++ptr;
while (ptr < end && buf[ptr] > INT_SPACE) {
++ptr;
}
int tokenEnd = ptr;
++ptr; // to skip trailing space (or, beyond end)
// Ok, decode... any more room?
++count;
if (dec.decodeValue(buf, start, tokenEnd)) {
if (!checkExpand(dec)) {
break;
}
}
}
} catch (IllegalArgumentException iae) {
// Need to convert to a checked stream exception
Location loc = scanner.getCurrentLocation();
String lexical = new String(buf, start, (ptr-start));
throw new TypedXMLStreamException(lexical, iae.getMessage(), loc, iae);
}
return count;
}
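/**
 * Method for decoding the specified attribute value as base64-encoded
 * binary content, using the given variant and decoder; decoding errors
 * are converted into (checked) TypedXMLStreamException instances.
 */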
public byte[] decodeBinaryValue(int index, Base64Variant v, CharArrayBase64Decoder dec, XmlScanner scanner)
throws XMLStreamException
{
if (index < 0 || index >= _attrCount) {
throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes");
}
int start, end;
if (index == 0) {
start = 0;
end = _valueOffsets[0];
} else {
start = _valueOffsets[index-1];
end = _valueOffsets[index];
}
int len = end-start;
dec.init(v, true, _valueBuffer, start, end, /* addl segments */ null);
try {
return dec.decodeCompletely();
} catch (IllegalArgumentException iae) {
// Need to convert to a checked stream exception
String lexical = new String(_valueBuffer, start, len);
throw new TypedXMLStreamException(lexical, iae.getMessage(), scanner.getCurrentLocation(), iae);
}
}
private final static boolean isSpace(char c)
{
return ((int) c) <= INT_SPACE;
}
/**
* Internal method used to see if we can expand the buffer that
* the array decoder has. Bit messy, but simpler than having
* separately typed instances; and called rarely so that performance
* downside of instanceof is irrelevant.
*/
private final boolean checkExpand(TypedArrayDecoder tad)
{
if (tad instanceof ValueDecoderFactory.BaseArrayDecoder) {
((ValueDecoderFactory.BaseArrayDecoder) tad).expand();
return true;
}
return false;
}
/*
/**********************************************************************
/* Internal methods
/**********************************************************************
*/
private void noteDupAttr(int ix1, int ix2)
{
_errorMsg = MessageFormat.format(ErrorConsts.ERR_WF_DUP_ATTRS,
new Object[] {
_names[ix1].toString(),
Integer.valueOf(ix1),
_names[ix2].toString(),
Integer.valueOf(ix2)
});
}
}