/* Aalto XML processor
*
* Copyright (c) 2006- Tatu Saloranta, [email protected]
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.in;
import java.text.MessageFormat;
import javax.xml.namespace.QName;
import javax.xml.stream.Location;
import javax.xml.stream.XMLStreamException;
import org.codehaus.stax2.typed.Base64Variant;
import org.codehaus.stax2.typed.TypedArrayDecoder;
import org.codehaus.stax2.typed.TypedValueDecoder;
import org.codehaus.stax2.typed.TypedXMLStreamException;
import org.codehaus.stax2.ri.typed.CharArrayBase64Decoder;
import org.codehaus.stax2.ri.typed.ValueDecoderFactory;
import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.util.DataUtil;
/**
* Object used by the tokenizer to collect and store information
 * about attributes; specifically, their names and values.
*
*/
public final class AttributeCollector
{
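/* Rough usage sketch, for illustration only: the real callers are the
 * XmlScanner implementations, and the offsets and variable names below
 * ("attrCollector", "endOffset") are hypothetical.
 *
 *   char[] buf = attrCollector.startNewValue(attrName, 0);
 *   // ... scanner copies value characters into buf, calling
 *   // valueBufferFull() if it runs out of room and continueValue()
 *   // to re-fetch the (possibly grown) buffer ...
 *   int count = attrCollector.finishLastValue(endOffset);
 *   if (count < 0) { // duplicate attribute detected
 *       // report attrCollector.getErrorMsg() via the scanner
 *   }
 *   String id = attrCollector.getValue("", "id");
 */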
private final static int INT_SPACE = 0x0020;
/**
* Let's guess that most of the time there won't be more than
* 12 attributes. Since the underlying buffer will be expanded
* as necessary, exact value is chosen to minimize overhead
* rather than eliminate any resizing.
*/
private final static int DEFAULT_ENTRY_COUNT = 12;
/**
* The default length of the value buffer is also chosen more
* to minimize overhead than to eliminate all need for resizing.
*/
private final static int DEFAULT_BUFFER_LENGTH = 120;
// // // Configuration
final ReaderConfig _config;
// // // State: actual collected attributes
/**
* Number of attributes currently held by this collector.
*/
private int _attrCount;
private PName[] _names = null;
/**
 * Consecutive character array in which attribute values are
 * concatenated.
*/
private char[] _valueBuffer = null;
// // // State: hash table (-like structure) for attributes
/**
* Int-based compact data structure that contains mapping from
* attribute names to attribute indexes in the main attribute name array.
*
 * Data structure contains two separate areas; main hash area (with
 * size <code>_hashAreaSize</code>), and remaining spillover area
 * that follows the hash area up until (but not including) the
 * <code>_spillAreaEnd</code> index.
* Main hash area only contains indexes (index+1; 0 signifying empty slot)
* to actual attributes; spillover area has both hash and index for
* any spilled entry. Spilled entries are simply stored in order
* added, and need to be searched using linear search. In case of both
* primary hash hits and spills, eventual comparison with the local
* name needs to be done with actual name array.
*/
protected int[] _attrMap = null;
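/* Illustrative layout (hypothetical contents): with, say, 5 attributes
 * the hash area would be sized to 8 ints. A slot value of 0 marks an
 * empty slot; a value of (i+1) points to attribute i in _names. Colliding
 * entries are appended as [hash, index] int pairs in the spill area that
 * starts at offset _hashAreaSize; here attribute 1 has spilled:
 *
 *   offset:   0  1  2  3  4  5  6  7 | 8      9   ...
 *   content:  3  0  1  0  5  0  0  4 | hash1  1   ...
 */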
/**
 * Size of hash area in <code>_attrMap</code>; generally at least 20%
 * more than number of attributes (<code>_attrCount</code>).
*/
protected int _hashAreaSize;
/**
 * Pointer to int slot right after last spill entry, in the
 * <code>_attrMap</code> array.
*/
protected int _spillAreaEnd;
// // // State: work-in-progress:
/**
 * Array that contains ending offsets of the values in the shared
 * buffer. Each entry contains the character offset right after the end
 * of the matching value; so entry 0, for example, also gives the
 * starting offset of entry 1.
*/
private int[] _valueOffsets = null;
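/* Worked example (hypothetical values): for two attributes with values
 * "ab" and "cde", the scanner copies the characters into _valueBuffer as
 * "abcde"; startNewValue() for the second attribute records offset 2 into
 * _valueOffsets[0], and finishLastValue(5) records 5 into _valueOffsets[1].
 * getValue(0) then returns characters [0,2) ("ab") and getValue(1)
 * characters [2,5) ("cde").
 */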
/**
 * Flag used to indicate that all attribute values for an element
 * have been parsed, and that the next call to {@link #startNewValue}
 * should reset the value structures.
*/
private boolean _needToResetValues = true;
/**
* For some errors, we'll have to temporarily store error message,
* to be thrown at a later point.
*/
private String _errorMsg = null;
// // // Temporary storage for optimizations
/**
* Concatenated String that contains all the attribute values
* for the element. Allows some buffer reuse, and should result
* in slight speed optimization, for elements with lots of
* attributes that are usually all (or none) accessed.
*/
private String _allAttrValues = null;
/*
/**********************************************************************
/* Life-cycle methods (creation, further construction)
/**********************************************************************
*/
protected AttributeCollector(ReaderConfig cfg)
{
_config = cfg;
_attrCount = 0;
}
/**
* Method called by the parser right after attribute name has been
* parsed, but before value has been parsed.
*
* @return Underlying character buffer to use for storing attribute
* value characters
*/
public char[] startNewValue(PName attrName, int currOffset)
{
int count;
if (_needToResetValues) {
_needToResetValues = false;
_attrCount = count = 0;
_allAttrValues = null;
if (_valueBuffer == null) { // first time for this instance
_names = new PName[DEFAULT_ENTRY_COUNT];
_valueBuffer = new char[DEFAULT_BUFFER_LENGTH];
_valueOffsets = new int[DEFAULT_ENTRY_COUNT];
}
} else {
// Not enough room for a new entry?
count = _attrCount;
if (count >= _valueOffsets.length) {
int[] oldVal = _valueOffsets;
PName[] oldNames = _names;
int oldLen = oldVal.length;
int newLen = oldLen + oldLen;
_valueOffsets = new int[newLen];
_names = new PName[newLen];
for (int i = 0; i < oldLen; ++i) {
_valueOffsets[i] = oldVal[i];
_names[i] = oldNames[i];
}
}
if (count > 0) { // no predecessor for the first entry
_valueOffsets[count-1] = currOffset;
}
}
_names[count] = attrName;
++_attrCount;
return _valueBuffer;
}
public char[] continueValue()
{
return _valueBuffer;
}
/**
* Method called after all attribute entries have been parsed,
* and thus the end of the last value in the buffer is known.
*
* @return Number of attributes collected
*/
public final int finishLastValue(int endingOffset)
{
// Did we get any values?
if (_needToResetValues) { // nope
return 0;
}
_needToResetValues = true; // so it'll get reset next time a value is started
// Since a previous startNewValue checked buffers, no check needed
int count = _attrCount;
_valueOffsets[count-1] = endingOffset;
/* So far so good. But now, also need to ensure there are no
* duplicates. This also allows us to create a hash for efficient
* access by name as a side effect. Since hash table building
* overhead is somewhat significant, let's only use it for 3 or
* more attributes.
*/
if (count < 3) {
_hashAreaSize = 0;
if (count == 2) {
PName[] names = _names;
if (names[0].boundEquals(names[1])) {
noteDupAttr(0, 1);
return -1;
}
}
return count;
}
return finishLastValue2();
}
public final int finishLastValue2()
{
int count = _attrCount;
PName[] names = _names;
// Ok, nope, better use a hash:
/* Ok, finally, let's create attribute map, to allow efficient
* access by prefix+localname combination. Could do it on-demand,
* but this way we can check for duplicates right away.
*/
int[] map = _attrMap;
/* What's minimum size to contain at most 80% full hash area,
* plus 1/8 spill area (12.5% spilled entries, two ints each)?
* Since we'll need 8 for 4 entries and up, and minimum to get
* here is 3 entries, let's just skip 4 entry map...
*/
int hashCount = 8;
{
int min = count + (count >> 2); // == 80% fill rate
/* Need to get 2^N size that can contain all elements, with
* 80% fill rate
*/
while (hashCount < min) {
hashCount += hashCount; // 2x
}
// And then add the spill area
_hashAreaSize = hashCount;
min = hashCount + (hashCount >> 4); // 12.5 x 2 ints
if (map == null || map.length < min) {
map = new int[min];
} else {
/* Need to clear old hash entries (if any). But note that
* spilled entries we can leave alone -- they are just ints,
* and get overwritten if and as needed
*/
map[0] = map[1] = map[2] = map[3] = map[4] = map[5] = map[6] = map[7] = 0;
for (int i = 8; i < hashCount; ++i) {
map[i] = 0;
}
}
}
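/* Worked sizing example (illustration only): with 13 attributes,
 * min = 13 + (13 >> 2) = 16, so hashCount grows from 8 to 16; the map is
 * then allocated (or reused) with at least 16 + (16 >> 4) = 17 ints,
 * leaving a little initial spill room. More spill room is added on demand
 * below via DataUtil.growArrayBy().
 */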
{
int mask = hashCount-1;
int spillIndex = hashCount;
// Ok, array's fine, let's hash 'em in!
for (int i = 0; i < count; ++i) {
PName newName = names[i];
int hash = newName.boundHashCode();
int index = hash & mask;
// Hash slot available?
int oldNameIndex = map[index];
if (oldNameIndex == 0) { // yup
map[index] = i+1; // since 0 is marker
} else { // nope, collision, need to spill
--oldNameIndex; // to unmask 0 etc
// But first, is it a dup?
if (names[oldNameIndex].boundEquals(newName)) {
// Only first collision needs to be reported
if (_errorMsg == null) {
noteDupAttr(oldNameIndex, i);
}
/* let's still continue to build hash, even if there's
* collision; to keep data as consistent (and accessible)
* as possible
*/
}
/* Is there room to spill into? (need 2 int slots;
 * one for the hash, the other for the index)
 */
if ((spillIndex + 1) >= map.length) {
// Let's just add room for 4 more spills (2 ints each)...
map = DataUtil.growArrayBy(map, 8);
}
// Let's first ensure we aren't adding a dup:
for (int j = hashCount; j < spillIndex; j += 2) {
if (map[j] == hash) {
oldNameIndex = map[j+1];
if (names[oldNameIndex].boundEquals(newName)) {
if (_errorMsg == null) {
noteDupAttr(oldNameIndex, i);
}
break;
}
}
}
map[spillIndex++] = hash;
map[spillIndex++] = i; // no need to mask 0
}
}
_spillAreaEnd = spillIndex;
}
_attrMap = map;
return (_errorMsg == null) ? count : -1;
}
/**
 * Method called by the owner when the value buffer has been filled up,
 * and more space is needed for the value being parsed.
 */
public char[] valueBufferFull()
{
/* Let's just double the size as necessary? Could also grow
* by less (50%?)... but shouldn't greatly matter
*/
_valueBuffer = DataUtil.growArrayBy(_valueBuffer, _valueBuffer.length);
return _valueBuffer;
}
/*
/**********************************************************************
/* Accessors
/**********************************************************************
*/
public final int getCount() { return _attrCount; }
public final PName getName(int index) { return _names[index]; }
public final QName getQName(int index)
{
return _names[index].constructQName();
}
public String getValue(int index)
{
int count = _attrCount;
/* Note: no checks, caller is to ensure index is ok. Acceptable
* since it's not externally exposed
*/
if (_allAttrValues == null) {
int len = _valueOffsets[count-1];
_allAttrValues = (len == 0) ? "" : new String(_valueBuffer, 0, len);
}
if (index == 0) {
if (count == 1) { // Degenerate case; only one substring?
return _allAttrValues;
}
int len = _valueOffsets[0];
return (len == 0) ? "" : _allAttrValues.substring(0, len);
}
/* !!! 11-Nov-2006, tatus: Should we cache constructed value?
* Might be worth the trouble
*/
int start = _valueOffsets[index-1];
int end = _valueOffsets[index];
return (start == end) ? "" : _allAttrValues.substring(start, end);
}
public String getValue(String nsUri, String localName)
{
int ix = findIndex(nsUri, localName);
return (ix >= 0) ? getValue(ix) : null;
}
public int findIndex(String nsUri, String localName)
{
int hashSize = _hashAreaSize;
// No hash? Linear search, then:
if (hashSize < 1) {
for (int i = 0, len = _attrCount; i < len; ++i) {
PName curr = _names[i];
if (curr.boundEquals(nsUri, localName)) {
return i;
}
}
return -1;
}
// Need to/can use hash... primary hit?
int hash = PName.boundHashCode(nsUri, localName);
int ix = _attrMap[hash & (hashSize-1)];
if (ix > 0) { // has primary entry, does it match?
--ix;
// Is primary candidate match?
if (_names[ix].boundEquals(nsUri, localName)) {
return ix;
}
/* Nope, need to traverse spill list, which has 2 entries for
* each spilled attribute id; first for hash value, second index.
*/
for (int i = hashSize, len = _spillAreaEnd; i < len; i += 2) {
if (_attrMap[i] != hash) {
continue;
}
/* Note: spill indexes are not off-by-one, since there's
* no need to mask 0
*/
ix = _attrMap[i+1];
if (_names[ix].boundEquals(nsUri, localName)) {
return ix;
}
}
}
return -1;
}
public String getErrorMsg() { return _errorMsg; }
/*
/**********************************************************************
/* Type-safe accessors to support TypedXMLStreamReader
/**********************************************************************
*/
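/**
 * Method that trims leading and trailing white space (characters with
 * code points at or below 0x0020) off the specified attribute value and
 * passes the result to the given decoder; an all-white-space value is
 * signaled via handleEmptyValue() instead.
 */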
public final void decodeValue(int index, TypedValueDecoder dec)
throws IllegalArgumentException
{
if (index < 0 || index >= _attrCount) {
throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes");
}
// No cached String values, better just pass char array ref
int start, end;
if (index == 0) {
start = 0;
end = _valueOffsets[0];
} else {
start = _valueOffsets[index-1];
end = _valueOffsets[index];
}
// Nonetheless, must trim before passing the value
final char[] buf = _valueBuffer;
while (true) {
if (start >= end) {
dec.handleEmptyValue();
return;
}
if (!isSpace(buf[start])) {
break;
}
++start;
}
// Trailing space?
while (--end > start && isSpace(buf[end])) { }
dec.decode(buf, start, end+1);
}
public final int decodeValues(int index, TypedArrayDecoder dec, XmlScanner scanner)
throws XMLStreamException
{
if (index < 0 || index >= _attrCount) {
throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes");
}
int start, end;
if (index == 0) {
start = 0;
end = _valueOffsets[0];
} else {
start = _valueOffsets[index-1];
end = _valueOffsets[index];
}
return decodeValues(dec, _valueBuffer, start, end, scanner);
}
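/**
 * Helper method that tokenizes the given value on white space (characters
 * with code points at or below 0x0020) and feeds each token to the array
 * decoder, expanding the decoder's buffer when possible. For example, a
 * value of " 1 2  3 " would typically result in three decodeValue() calls,
 * for "1", "2" and "3". Decoding errors are converted into (checked)
 * TypedXMLStreamException instances.
 */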
private final int decodeValues(TypedArrayDecoder dec,
final char[] buf, int ptr, final int end,
final XmlScanner scanner)
throws XMLStreamException
{
int start = ptr;
int count = 0;
try {
decode_loop:
while (ptr < end) {
// First, any space to skip?
while (buf[ptr] <= INT_SPACE) {
if (++ptr >= end) {
break decode_loop;
}
}
// Then let's figure out non-space char (token)
start = ptr;
++ptr;
while (ptr < end && buf[ptr] > INT_SPACE) {
++ptr;
}
int tokenEnd = ptr;
++ptr; // to skip trailing space (or, beyond end)
// Ok, decode... any more room?
++count;
if (dec.decodeValue(buf, start, tokenEnd)) {
if (!checkExpand(dec)) {
break;
}
}
}
} catch (IllegalArgumentException iae) {
// Need to convert to a checked stream exception
Location loc = scanner.getCurrentLocation();
String lexical = new String(buf, start, (ptr-start));
throw new TypedXMLStreamException(lexical, iae.getMessage(), loc, iae);
}
return count;
}
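/**
 * Method for decoding the specified attribute value as base64-encoded
 * binary content, using the given variant and decoder; decoding errors
 * are converted into (checked) TypedXMLStreamException instances.
 */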
public byte[] decodeBinaryValue(int index, Base64Variant v, CharArrayBase64Decoder dec, XmlScanner scanner)
throws XMLStreamException
{
if (index < 0 || index >= _attrCount) {
throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes");
}
int start, end;
if (index == 0) {
start = 0;
end = _valueOffsets[0];
} else {
start = _valueOffsets[index-1];
end = _valueOffsets[index];
}
int len = end-start;
dec.init(v, true, _valueBuffer, start, end, /* addl segments */ null);
try {
return dec.decodeCompletely();
} catch (IllegalArgumentException iae) {
// Need to convert to a checked stream exception
String lexical = new String(_valueBuffer, start, len);
throw new TypedXMLStreamException(lexical, iae.getMessage(), scanner.getCurrentLocation(), iae);
}
}
private final static boolean isSpace(char c)
{
return ((int) c) <= INT_SPACE;
}
/**
* Internal method used to see if we can expand the buffer that
* the array decoder has. Bit messy, but simpler than having
* separately typed instances; and called rarely so that performance
* downside of instanceof is irrelevant.
*/
private final boolean checkExpand(TypedArrayDecoder tad)
{
if (tad instanceof ValueDecoderFactory.BaseArrayDecoder) {
((ValueDecoderFactory.BaseArrayDecoder) tad).expand();
return true;
}
return false;
}
/*
/**********************************************************************
/* Internal methods
/**********************************************************************
*/
private void noteDupAttr(int ix1, int ix2)
{
_errorMsg = MessageFormat.format(ErrorConsts.ERR_WF_DUP_ATTRS,
new Object[] {
_names[ix1].toString(),
Integer.valueOf(ix1),
_names[ix2].toString(),
Integer.valueOf(ix2)
});
}
}