lux.CachingDocReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lux Show documentation
Lux XML search engine
There is a newer version: 1.1.0
package lux;

import java.io.IOException;
import java.io.StringReader;
import java.util.Map;

import javax.xml.transform.stream.StreamSource;

import lux.exception.LuxException;
import lux.index.FieldName;
import lux.index.IndexConfiguration;
import lux.index.field.TinyBinaryField;
import lux.xml.tinybin.TinyBinary;
import net.sf.saxon.Configuration;
import net.sf.saxon.s9api.DocumentBuilder;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.tree.tiny.TinyDocumentImpl;
import net.sf.saxon.tree.tiny.TinyTree;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.BytesRef;
import org.slf4j.LoggerFactory;

/**
 * Reads, parses and caches XML documents from a Lucene index. Assigns Lucene
 * docIDs as Saxon document numbers. This reader and its cache are intended to survive for a
 * single query only, and is *not thread-safe*.
 */
public class CachingDocReader {
    // the (inverse of the) portion of heap to allocate to each per-request document cache.  
    // This should really rely on the number of clients
    private final static int CACHE_RATIO = 100;
	private final NodeCache cache = new NodeCache(java.lang.Runtime.getRuntime().maxMemory() / CACHE_RATIO);
    private final String xmlFieldName;
    private final String uriFieldName;
    private final DocumentBuilder builder;
    private final Configuration config;
    private int cacheHits = 0;
    private int cacheMisses = 0;
    private long buildTime = 0;

    /**
     * Create a CachingDocReader that will use the provided objects to read and
     * parse XML documents.
     * 
     * @param builder
     *            will be used to construct XML documents as XdmNodes
     * @param config
     *            assigns the proper document ID to each constructed document
     * @param indexConfig
     *            supplies the names of the xml storage and uri fields
     */
    public CachingDocReader(DocumentBuilder builder, Configuration config, IndexConfiguration indexConfig) {
        this.builder = builder;
        this.config = config;
        this.xmlFieldName = indexConfig.getFieldName(FieldName.XML_STORE);
        this.uriFieldName = indexConfig.getFieldName(FieldName.URI);
    }

    /**
     * Reads the document with the given relative id from an atomic reader.
     * If cached, the cached copy is returned. Otherwise the document is read from the index. 
     * If the document does not exist in the index, or has been deleted, results are not
     * well-defined: see {@link IndexReader}.
     * 
     * @param leafDocID the relative docid of the document to read
     * @param context an atomic Lucene index reader context (a leaf of the segmented index tree)
     * @return the document, as a Saxon XdmNode
     * @throws IOException if there is some sort of low-level problem with the index
     * @throws LuxException if there is an error building the document that has been retrieved
     */
    public XdmNode get(int leafDocID, AtomicReaderContext context) throws IOException {
       long docID = leafDocID + context.docBase;
       XdmNode node= cache.get(docID);
       if (node != null) {
           ++cacheHits;
           return node;
       }
       DocumentStoredFieldVisitor fieldSelector = new DocumentStoredFieldVisitor();
       context.reader().document(leafDocID, fieldSelector);
       Document document = fieldSelector.getDocument();
       return getXdmNode(docID, document);
    }
    
    /**
     * Reads the document with the given id. If cached, the cached copy is
     * returned. Otherwise the document is read from the index. If the document
     * does not exist in the index, or has been deleted, results are not
     * well-defined: see {@link IndexReader}.
     * 
     * @param docID the absolute docid of the document to read
     * @param reader the Lucene index reader
     * @return the document, as a Saxon XdmNode
     * @throws IOException if there is some sort of low-level problem with the index
     * @throws LuxException if there is an error building the document that has been retrieved
     */
    public XdmNode get(int docID, IndexReader reader) throws IOException {
        XdmNode node = cache.get(docID);
        if (node != null) {
            ++cacheHits;
            return node;
        }
        DocumentStoredFieldVisitor fieldSelector = new DocumentStoredFieldVisitor();
        reader.document(docID, fieldSelector);
        Document document = fieldSelector.getDocument();
        return getXdmNode(docID, document);
    }
    
    /**
     * @param docID
     * @return a document from the cache, or null if no document matching the docID is in the cache
     */
    public XdmNode get(long docID) {
        return cache.get(docID);
    }

    private XdmNode getXdmNode(long docID, Document document) throws IOException {
        String xml = document.get(xmlFieldName);
        String uri = document.get(uriFieldName);
        BytesRef binaryValue = document.getBinaryValue(xmlFieldName);
        byte[] bytes;
        if (binaryValue != null) {
            bytes = binaryValue.bytes;
        } else {
            bytes = null;
        }
        XdmNode node = createXdmNode (docID, uri, xml, bytes);
        document.removeField(xmlFieldName);
        node.getUnderlyingNode().getDocumentRoot().setUserData (Document.class.getName(), document);
        return node;
    }

    /**
     * Either the xml or the bytes parameter carries the document contents.  If the xml is non-null, it is assumed to be a serialized
     * XML document, and is parsed.  If the bytes is non-null and begins with a tinybin signature, it is interpreted as 
     * tiny binary.  Otherwise, the bytes are treated as a binary (non-XML) document.  If both xml and bytes are null, a
     * warning is logged and an empty binary document created.
     * 
     * @param docID The Solr/Lucene docid; or a shard/docid combo under Solr Cloud
     * @param uri The uri path (no scheme) of the document to create
     * @param xml The contents of the document, if an XML document
     * @param bytes The contents of the document
     * @return a new document made from the arguments
     */
    public XdmNode createXdmNode (long docID, String uri, String xml, byte[] bytes) {
        uri = "lux:/" + uri;
        DocIDNumberAllocator docIdAllocator = (DocIDNumberAllocator) config.getDocumentNumberAllocator();
        docIdAllocator.setNextDocID(docID);
        long t0 = System.nanoTime();
        XdmNode node = null;
        if (xml == null) {
            if (bytes == null) {
                // This is a document without the expected fields, as will happen, eg if we just connect to
                // some random database.
                LoggerFactory.getLogger(CachingDocReader.class).warn ("Document {} has no XML content", docID);
                bytes = new byte[0];
            }
            if (bytes.length > 4 && bytes[0] == 'T' && bytes[1] == 'I' && bytes[2] == 'N') {
                // An XML document stored in tiny binary format
                TinyBinary tb = new TinyBinary(bytes, TinyBinaryField.UTF8);
                node = new XdmNode (tb.getTinyDocument(config));
            } else {
                xml = "";
            }
        }
        if (node == null) {
            StreamSource source = new StreamSource(new StringReader(xml));
            source.setSystemId(uri);
            try {
                node = builder.build(source);
            } catch (SaxonApiException e) {
                // shouldn't normally happen since the document would generally have
                // been parsed when indexed.
                throw new LuxException(e);
            }
        }
        // associate the bytes with the xml stub (for all non-XML content)
        if (bytes != null && xml == null) {
            ((TinyDocumentImpl)node.getUnderlyingNode()).setUserData("_binaryDocument", bytes);
        }
        // doesn't seem to do what one might think:
        // ((TinyDocumentImpl) node.getUnderlyingNode()).setBaseURI(uri);
        ((TinyDocumentImpl) node.getUnderlyingNode()).setSystemId(uri);
        buildTime += (System.nanoTime() - t0);
        cache.put(docID, node);
        ++cacheMisses;
        return node;
    }

    /**
     * @return the number of items retrieved from the cache
     */
    public int getCacheHits() {
        return cacheHits;
    }

    /**
     * @return the number of items retrieved and added to the cache
     */
    public int getCacheMisses() {
        return cacheMisses;
    }

    /**
     * @return the total time spent building documents (in nanoseconds). This
     *         includes time spent parsing and constructing a Saxon
     *         NodeInfo/XdmNode.
     */
    public long getBuildTime() {
        return buildTime;
    }

    /**
     * Clears all cached documents.
     */
    public void clear() {
        cache.clear();
    }
    
    static class NodeCache extends java.util.LinkedHashMap {
        
        private long bytes;
        private final long maxbytes;

        public NodeCache (long maxbytes) {
            super((int) ((maxbytes / 100) * 4 / 3 + 1), 0.75f, true);
            this.maxbytes = maxbytes;
            this.bytes = 0;
        }
        
        @Override
        public XdmNode put (Long key, XdmNode value) {
            bytes += calculateSize (value);
            return super.put(key, value);
        }
        
        @Override
        protected boolean removeEldestEntry(Map.Entry eldest) {
            while (bytes > maxbytes) {
                remove(eldest.getKey());
                bytes -= calculateSize (eldest.getValue());
                for (Map.Entry entry : entrySet()) {
                    // get the next eldest
                    eldest = entry;
                    break;
                }
            }
            return false;
        }

        /**
         * @param value a document whose size is to be calculated
         * @return the size of a TinyBinary representation of the document, which should be pretty close to the actual
         * heap size used.
         */
        private long calculateSize(XdmNode value) {
            TinyTree tree = ((TinyDocumentImpl)value.getUnderlyingNode()).getTree();
            int nodeCount = tree.getNumberOfNodes();
            int attCount = tree.getNumberOfAttributes();
            int nsCount = tree.getNumberOfNamespaces();
            byte[] binary = (byte[]) ((TinyDocumentImpl)value.getUnderlyingNode()).getUserData("_binaryDocument");
            int binSize = binary == null ? 0 : binary.length;
            // gross estimate of the number of string pointers
            // Note: we don't count the size of the names in the name pool, on the assumption they will be shared
            // by a lot of documents.
            int stringLen = 12 + 2 * nodeCount + 2 * attCount + 2 * nsCount;
            // actually count the lengths of all the attributes
            CharSequence[] attValueArray = tree.getAttributeValueArray();
            for (int i = 0; i < attCount; i++) {
                stringLen += attValueArray[i].length();
            }
            return 36
                    + binSize
                    + nodeCount * 19
                    + attCount * 8
                    + nsCount * 8
                    + tree.getCharacterBuffer().length() * 2
                    + (tree.getCommentBuffer() == null ? 0 : tree.getCommentBuffer().length() * 2) 
                    + stringLen;
        }
    }

}


/*
 * This Source Code Form is subject to the terms of the Mozilla Public License,
 * v. 2.0. If a copy of the MPL was not distributed with this file, You can
 * obtain one at http://mozilla.org/MPL/2.0/.
 */