All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.fasterxml.aalto.in.AttributeCollector Maven / Gradle / Ivy

There is a newer version: 1.3.3
Show newest version
/* Aalto XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.in;

import java.text.MessageFormat;

import javax.xml.namespace.QName;
import javax.xml.stream.Location;
import javax.xml.stream.XMLStreamException;

import org.codehaus.stax2.typed.Base64Variant;
import org.codehaus.stax2.typed.TypedArrayDecoder;
import org.codehaus.stax2.typed.TypedValueDecoder;
import org.codehaus.stax2.typed.TypedXMLStreamException;

import org.codehaus.stax2.ri.typed.CharArrayBase64Decoder;
import org.codehaus.stax2.ri.typed.ValueDecoderFactory;


import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.util.DataUtil;

/**
 * Object used by the tokenizer to collect and store information
 * about attributes, specifically, names and values.
 *

*/ public final class AttributeCollector { private final static int INT_SPACE = 0x0020; /** * Let's guess that most of the time there won't be more than * 12 attributes. Since the underlying buffer will be expanded * as necessary, exact value is chosen to minimize overhead * rather than eliminate any resizing. */ private final static int DEFAULT_ENTRY_COUNT = 12; /** * The default length of the value buffer is also chosen more * to minimize overhead than to eliminate all need for resizing. */ private final static int DEFAULT_BUFFER_LENGTH = 120; // // // Configuration final ReaderConfig _config; // // // State: actual collected attributes /** * Number of attributes currently held by this collector. */ private int _attrCount; private PName[] _names = null; /** * Consequtive character array, in which attribute values are * concatenated in */ private char[] _valueBuffer = null; // // // State: hash table (-like structure) for attributes /** * Int-based compact data structure that contains mapping from * attribute names to attribute indexes in the main attribute name array. *

* Data structure contains two separate areas; main hash area (with * size _hashAreaSize), and remaining spillover area * that follows hash area up until (but not including) * _spillAreaEnd index. * Main hash area only contains indexes (index+1; 0 signifying empty slot) * to actual attributes; spillover area has both hash and index for * any spilled entry. Spilled entries are simply stored in order * added, and need to be searched using linear search. In case of both * primary hash hits and spills, eventual comparison with the local * name needs to be done with actual name array. */ protected int[] _attrMap = null; /** * Size of hash area in _attrMap; generally at least 20% * more than number of attributes (_attrCount). */ protected int _hashAreaSize; /** * Pointer to int slot right after last spill entry, in * _attrMap array. */ protected int _spillAreaEnd; // // // State: work-in-progress: /** * Array that contains ending offsets of the values in the shared * buffer. Entries contain character offset after the end of * the matching offset; so entry 0 for example contains starting * offset of the entry 1. */ private int[] _valueOffsets = null; /** * Flag used to indicate that all attribute values for an element * have been parsed, and that next call to startNewValue * should reset the value structures */ private boolean _needToResetValues = true; /** * For some errors, we'll have to temporarily store error message, * to be thrown at a later point. */ private String _errorMsg = null; // // // Temporary storage for optimizations /** * Concatenated String that contains all the attribute values * for the element. Allows some buffer reuse, and should result * in slight speed optimization, for elements with lots of * attributes that are usually all (or none) accessed. 
*/ private String _allAttrValues = null; /* /********************************************************************** /* Life-cycle methods (creation, further construction) /********************************************************************** */ protected AttributeCollector(ReaderConfig cfg) { _config = cfg; _attrCount = 0; } /** * Method called by the parser right after attribute name has been * parsed, but before value has been parsed. * * @return Underlying character buffer to use for storing attribute * value characters */ public char[] startNewValue(PName attrName, int currOffset) { int count; if (_needToResetValues) { _needToResetValues = false; _attrCount = count = 0; _allAttrValues = null; if (_valueBuffer == null) { // first time for this instance _names = new PName[DEFAULT_ENTRY_COUNT]; _valueBuffer = new char[DEFAULT_BUFFER_LENGTH]; _valueOffsets = new int[DEFAULT_ENTRY_COUNT]; } } else { // Not enough room for a new entry? count = _attrCount; if (count >= _valueOffsets.length) { int[] oldVal = _valueOffsets; PName[] oldNames = _names; int oldLen = oldVal.length; int newLen = oldLen + oldLen; _valueOffsets = new int[newLen]; _names = new PName[newLen]; for (int i = 0; i < oldLen; ++i) { _valueOffsets[i] = oldVal[i]; _names[i] = oldNames[i]; } } if (count > 0) { // no predecessor for the first entry _valueOffsets[count-1] = currOffset; } } _names[count] = attrName; ++_attrCount; return _valueBuffer; } public char[] continueValue() { return _valueBuffer; } /** * Method called after all attribute entries have been parsed, * and thus the end of the last value in the buffer is known. * * @return Number of attributes collected */ public final int finishLastValue(int endingOffset) { // Did we get any values? 
if (_needToResetValues) { // nope return 0; } _needToResetValues = true; // so it'll get reset next time a value is started // Since a previous startNewValue checked buffers, no check needed int count = _attrCount; _valueOffsets[count-1] = endingOffset; /* So far so good. But now, also need to ensure there are no * duplicates. This also allows us to create a hash for efficient * access by name as a side effect. Since hash table building * overhead is somewhat significant, let's only use it for 3 or * more attributes. */ if (count < 3) { _hashAreaSize = 0; if (count == 2) { PName[] names = _names; if (names[0].boundEquals(names[1])) { noteDupAttr(0, 1); return -1; } } return count; } return finishLastValue2(); } public final int finishLastValue2() { int count = _attrCount; PName[] names = _names; // Ok, nope, better use a hash: /* Ok, finally, let's create attribute map, to allow efficient * access by prefix+localname combination. Could do it on-demand, * but this way we can check for duplicates right away. */ int[] map = _attrMap; /* What's minimum size to contain at most 80% full hash area, * plus 1/8 spill area (12.5% spilled entries, two ints each)? * Since we'll need 8 for 4 entries and up, and minimum to get * here is 3 entries, let's just skip 4 entry map... */ int hashCount = 8; { int min = count + (count >> 2); // == 80% fill rate /* Need to get 2^N size that can contain all elements, with * 80% fill rate */ while (hashCount < min) { hashCount += hashCount; // 2x } // And then add the spill area _hashAreaSize = hashCount; min = hashCount + (hashCount >> 4); // 12.5 x 2 ints if (map == null || map.length < min) { map = new int[min]; } else { /* Need to clear old hash entries (if any). 
But note that * spilled entries we can leave alone -- they are just ints, * and get overwritten if and as needed */ map[0] = map[1] = map[2] = map[3] = map[4] = map[5] = map[6] = map[7] = 0; for (int i = 8; i < hashCount; ++i) { map[i] = 0; } } } { int mask = hashCount-1; int spillIndex = hashCount; // Ok, array's fine, let's hash 'em in! for (int i = 0; i < count; ++i) { PName newName = names[i]; int hash = newName.boundHashCode(); int index = hash & mask; // Hash slot available? int oldNameIndex = map[index]; if (oldNameIndex == 0) { // yup map[index] = i+1; // since 0 is marker } else { // nope, collision, need to spill --oldNameIndex; // to unmask 0 etc // But first, is it a dup? if (names[oldNameIndex].boundEquals(newName)) { // Only first collision needs to be reported if (_errorMsg == null) { noteDupAttr(oldNameIndex, i); } /* let's still continue to build hash, even if there's * collision; to keep data as consistent (and accessible) * as possible */ } /* Is there room to spill into? (need to 2 int spaces; * one for hash, the other for index) */ if ((spillIndex + 1)>= map.length) { // Let's just add room for 4 spills... map = DataUtil.growArrayBy(map, 8); } // Let's first ensure we aren't adding a dup: for (int j = hashCount; j < spillIndex; j += 2) { if (map[j] == hash) { oldNameIndex = map[j+1]; if (names[oldNameIndex].boundEquals(newName)) { if (_errorMsg == null) { noteDupAttr(oldNameIndex, i); } break; } } } map[spillIndex++] = hash; map[spillIndex++] = i; // no need to mask 0 } } _spillAreaEnd = spillIndex; } _attrMap = map; return (_errorMsg == null) ? count : -1; } /** * Method called by the owner, when the */ public char[] valueBufferFull() { /* Let's just double the size as necessary? Could also grow * by less (50%?)... 
but shouldn't greatly matter */ _valueBuffer = DataUtil.growArrayBy(_valueBuffer, _valueBuffer.length); return _valueBuffer; } /* /********************************************************************** /* Accessors /********************************************************************** */ public final int getCount() { return _attrCount; } public final PName getName(int index) { return _names[index]; } public final QName getQName(int index) { return _names[index].constructQName(); } public String getValue(int index) { int count = _attrCount; /* Note: no checks, caller is to ensure index is ok. Acceptable * since it's not externally exposed */ if (_allAttrValues == null) { int len = _valueOffsets[count-1]; _allAttrValues = (len == 0) ? "" : new String(_valueBuffer, 0, len); } if (index == 0) { if (count == 1) { // Degenerate case; only one substring? return _allAttrValues; } int len = _valueOffsets[0]; return (len == 0) ? "" : _allAttrValues.substring(0, len); } /* !!! 11-Nov-2006, tatus: Should we cache constructed value? * Might be worth the trouble */ int start = _valueOffsets[index-1]; int end = _valueOffsets[index]; return (start == end) ? "" : _allAttrValues.substring(start, end); } public String getValue(String nsUri, String localName) { int ix = findIndex(nsUri, localName); return (ix >= 0) ? getValue(ix) : null; } public int findIndex(String nsUri, String localName) { int hashSize = _hashAreaSize; // No hash? Linear search, then: if (hashSize < 1) { for (int i = 0, len = _attrCount; i < len; ++i) { PName curr = _names[i]; if (curr.boundEquals(nsUri, localName)) { return i; } } return -1; } // Need to/can use hash... primary hit? int hash = PName.boundHashCode(nsUri, localName); int ix = _attrMap[hash & (hashSize-1)]; if (ix > 0) { // has primary entry, does it match? --ix; // Is primary candidate match? 
if (_names[ix].boundEquals(nsUri, localName)) { return ix; } /* Nope, need to traverse spill list, which has 2 entries for * each spilled attribute id; first for hash value, second index. */ for (int i = hashSize, len = _spillAreaEnd; i < len; i += 2) { if (_attrMap[i] != hash) { continue; } /* Note: spill indexes are not off-by-one, since there's * no need to mask 0 */ ix = _attrMap[i+1]; if (_names[ix].boundEquals(nsUri, localName)) { return ix; } } } return -1; } public String getErrorMsg() { return _errorMsg; } /* /********************************************************************** /* Type-safe accessors to support TypedXMLStreamReader /********************************************************************** */ public final void decodeValue(int index, TypedValueDecoder dec) throws IllegalArgumentException { if (index < 0 || index >= _attrCount) { throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes"); } // No cached String values, better just pass char array ref int start, end; if (index == 0) { start = 0; end = _valueOffsets[0]; } else { start = _valueOffsets[index-1]; end = _valueOffsets[index]; } // Nonetheless, must trim before passing the value final char[] buf = _valueBuffer; while (true) { if (start >= end) { dec.handleEmptyValue(); return; } if (!isSpace(buf[start])) { break; } ++start; } // Trailing space? 
while (--end > start && isSpace(buf[end])) { } dec.decode(buf, start, end+1); } public final int decodeValues(int index, TypedArrayDecoder dec, XmlScanner scanner) throws XMLStreamException { if (index < 0 || index >= _attrCount) { throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes"); } int start, end; if (index == 0) { start = 0; end = _valueOffsets[0]; } else { start = _valueOffsets[index-1]; end = _valueOffsets[index]; } return decodeValues(dec, _valueBuffer, start, end, scanner); } private final int decodeValues(TypedArrayDecoder dec, final char[] buf, int ptr, final int end, final XmlScanner scanner) throws XMLStreamException { int start = ptr; int count = 0; try { decode_loop: while (ptr < end) { // First, any space to skip? while (buf[ptr] <= INT_SPACE) { if (++ptr >= end) { break decode_loop; } } // Then let's figure out non-space char (token) start = ptr; ++ptr; while (ptr < end && buf[ptr] > INT_SPACE) { ++ptr; } int tokenEnd = ptr; ++ptr; // to skip trailing space (or, beyond end) // Ok, decode... any more room? 
++count; if (dec.decodeValue(buf, start, tokenEnd)) { if (!checkExpand(dec)) { break; } } } } catch (IllegalArgumentException iae) { // Need to convert to a checked stream exception Location loc = scanner.getCurrentLocation(); String lexical = new String(buf, start, (ptr-start)); throw new TypedXMLStreamException(lexical, iae.getMessage(), loc, iae); } return count; } public byte[] decodeBinaryValue(int index, Base64Variant v, CharArrayBase64Decoder dec, XmlScanner scanner) throws XMLStreamException { if (index < 0 || index >= _attrCount) { throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes"); } int start, end; if (index == 0) { start = 0; end = _valueOffsets[0]; } else { start = _valueOffsets[index-1]; end = _valueOffsets[index]; } int len = end-start; dec.init(v, true, _valueBuffer, start, end, /* addl segments */ null); try { return dec.decodeCompletely(); } catch (IllegalArgumentException iae) { // Need to convert to a checked stream exception String lexical = new String(_valueBuffer, start, len); throw new TypedXMLStreamException(lexical, iae.getMessage(), scanner.getCurrentLocation(), iae); } } private final static boolean isSpace(char c) { return ((int) c) <= INT_SPACE; } /** * Internal method used to see if we can expand the buffer that * the array decoder has. Bit messy, but simpler than having * separately typed instances; and called rarely so that performance * downside of instanceof is irrelevant. 
*/ private final boolean checkExpand(TypedArrayDecoder tad) { if (tad instanceof ValueDecoderFactory.BaseArrayDecoder) { ((ValueDecoderFactory.BaseArrayDecoder) tad).expand(); return true; } return false; } /* /********************************************************************** /* Internal methods /********************************************************************** */ private void noteDupAttr(int ix1, int ix2) { _errorMsg = MessageFormat.format(ErrorConsts.ERR_WF_DUP_ATTRS, new Object[] { _names[ix1].toString(), new Integer(ix1), _names[ix2].toString(), new Integer(ix2) }); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy