lux.xml.OffsetDocBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lux Show documentation
Lux XML search engine
There is a newer version: 1.1.0
package lux.xml;

import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.SaxonApiException;

/**
 * A {@link SaxonDocBuilder} that additionally records character-offset information
 * about a document while it is being parsed.
 *
 * <p>A single {@link Offsets} object is created by the constructor and reused for
 * subsequent parses; {@link #reset()} clears it, so its contents are overwritten
 * when another document is read.
 */
public class OffsetDocBuilder extends SaxonDocBuilder {

    /** Markup that opens a CDATA section; it precedes the section's text in the source. */
    private static final String CDATA_START = "<![CDATA[";

    /** Markup that closes a CDATA section; it follows the section's text in the source. */
    private static final String CDATA_END = "]]>";

    private final Offsets offsets;

    /**
     * Source offset just past the most recent text-like event, or -1 when the next
     * text event should be recorded as an absolute offset rather than as a delta.
     */
    private int lastTextLocation;

    private boolean fixupCRLF = false;

    /**
     * @param processor a Saxon processor, passed through to the superclass.
     * @throws SaxonApiException declared for the superclass constructor.
     */
    public OffsetDocBuilder(Processor processor) throws SaxonApiException {
        super(processor);
        offsets = new Offsets();
    }

    /** Resets the superclass state and clears the accumulated offsets. */
    @Override
    public void reset () {
        super.reset();
        offsets.reset();
    }

    /**
     * Forwards the event to the superclass and then records offset information for it.
     *
     * @param reader the stream reader positioned on the event
     * @param eventType one of the {@link XMLStreamConstants} event codes
     * @throws XMLStreamException if the superclass handler fails
     */
    @Override
    public void handleEvent(XMLStreamReader reader, int eventType) throws XMLStreamException {

        super.handleEvent(reader, eventType);

        switch (eventType) {

        case XMLStreamConstants.START_DOCUMENT:
            lastTextLocation = -1;
            break;

        case XMLStreamConstants.START_ELEMENT:
        case XMLStreamConstants.END_ELEMENT:
        case XMLStreamConstants.COMMENT:
        case XMLStreamConstants.PROCESSING_INSTRUCTION:
            recordOffsets(reader);
            break;

        case XMLStreamConstants.CDATA:
            // The text of a CDATA section starts after its opening delimiter.
            // NOTE(review): counting the closing delimiter in the length (so that
            // lastTextLocation lands after "]]>") is the presumed intent; confirm
            // against the offset-mapping tests.
            recordOffsets(reader,
                    reader.getLocation().getCharacterOffset() + CDATA_START.length(),
                    reader.getTextLength() + CDATA_END.length());
            break;

        case XMLStreamConstants.SPACE:
        case XMLStreamConstants.CHARACTERS: {
            int location = reader.getLocation().getCharacterOffset();
            int textLength = reader.getTextLength();
            if (isFixupCRLF()) {
                char[] chars = reader.getTextCharacters();
                int start = reader.getTextStart();
                // Only inspect the first character when the run actually has one.
                if (textLength > 0 && chars[start] == '\n') {
                    // A leading line feed is taken to be the tail of a CRLF pair,
                    // so the recorded start is shifted past the extra character.
                    recordOffsets(reader, location + 1, textLength);
                } else {
                    recordOffsets(reader, location, textLength);
                }
                offsetCRLF(location, chars, start, textLength);
            } else {
                recordOffsets(reader, location, textLength);
            }
            break;
        }

        case XMLStreamConstants.ENTITY_REFERENCE: {
            String text = reader.getText();
            recordOffsets(reader, reader.getLocation().getCharacterOffset(), text.length());
            break;
        }
        }
    }

    // Generate character offsets wherever there is a line feed (\n == 10),
    // since we're told it was a CRLF (\r\n = 13, 10) in the original text.
    // XML parsers are *required* to perform this "normalization".
    // The first character of the run is skipped here: handleEvent accounts for a
    // leading line feed by shifting the recorded start location instead.
    private void offsetCRLF(int location, char[] cbuf, int off, int size) {
        for (int i = off + 1; i < off + size; i++) {
            if (cbuf[i] == '\n') {
                // (i - off) is the distance of this line feed from the start of the
                // run, so the delta is placed that far after the run's location.
                offsets.addDelta(location + (i - off), (short) 1);
            }
        }
    }

    // Keep track of the location at the end of the last text event, and use that to infer the presence of
    // a length-changing entity reference.  The lastTextLocation is reset to -1 at the start of the
    // document and after every element, comment and processing-instruction event, so that the
    // first text event that follows will have its position stored absolutely.  For each subsequent
    // text-like event within the same text node (which will occur because entity references are
    // reported as separate events), compute a delta based on the difference of the end offset of
    // the last text event and the start offset of this text event.  Store the delta for use by an
    // offset-correcting CharStream.
    private void recordOffsets(XMLStreamReader reader, int location, int textLength) throws XMLStreamException {
        if (lastTextLocation < 0) {
            offsets.addOffset (location);
        } else {
            offsets.addDelta (location, (short) (location - lastTextLocation));
        }
        lastTextLocation = location + textLength;
    }

    // Called for non-text events.  If the event starts beyond the end of the preceding
    // text event, record the gap as a delta; in every case the next text event will be
    // recorded as an absolute offset.
    private void recordOffsets(XMLStreamReader reader) throws XMLStreamException {
        int location = reader.getLocation().getCharacterOffset();
        if (lastTextLocation >= 0 && location > lastTextLocation) {
            offsets.addDelta (location, (short) (location - lastTextLocation));
        }
        lastTextLocation = -1;
    }

    /**
     * @return the offsets accumulated for the parsed document.  The same instance is
     * returned on every call and is never null; its contents are only meaningful
     * after a document has been parsed.
     */
    @Override
    public Offsets getOffsets() {
        return offsets;
    }

    /**
     * @return whether line feeds in text are treated as CRLF pairs in the original
     * source when offsets are recorded.
     */
    public boolean isFixupCRLF() {
        return fixupCRLF;
    }

    /**
     * @param fixupCRLF true to record an extra one-character delta for line feeds in
     * text, on the assumption that each was a CRLF pair in the original source.
     */
    public void setFixupCRLF(boolean fixupCRLF) {
        this.fixupCRLF = fixupCRLF;
    }
}