/*
 * lux.functions.FieldTerms Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of lux Show documentation
 * Lux XML search engine
 * There is a newer version: 1.1.0
 */
package lux.functions;

import java.io.IOException;

import lux.Evaluator;
import lux.index.IndexConfiguration;
import lux.index.field.XmlTextField;
import lux.solr.CloudQueryRequest;
import lux.solr.SolrQueryContext;
import lux.solr.XQueryComponent;
import lux.xpath.FunCall;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.lib.ExtensionFunctionCall;
import net.sf.saxon.lib.ExtensionFunctionDefinition;
import net.sf.saxon.om.Item;
import net.sf.saxon.om.LazySequence;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.om.SequenceIterator;
import net.sf.saxon.om.StructuredQName;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.value.AtomicValue;
import net.sf.saxon.value.EmptySequence;
import net.sf.saxon.value.SequenceType;
import net.sf.saxon.value.StringValue;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.TermsParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.response.SolrQueryResponse;
import org.slf4j.LoggerFactory;

/**
 * function lux:field-terms($field-name as xs:string?, $start as xs:string?) as xs:anyAtomicItem*
 * 
 * This function accepts the name of a Lucene field, and a starting value, and
 * returns the sequence of terms drawn from the field, ordered according to its
 * natural order, starting with the first term that is &gt;= the starting value.
 * 
 * 
 * If the $field-name argument is empty, the terms are drawn from the default
 * field defined by the {@link IndexConfiguration}, generally the
 * {@link XmlTextField}.  If there is no default field either, the result is
 * the empty sequence.
 * 
 */
public class FieldTerms extends ExtensionFunctionDefinition {

    /** Number of terms requested from the Solr terms handler per round trip. */
    private static final int SOLR_TERMS_BATCH_SIZE = 100;

    /** Path under which the Solr terms request handler is looked up, and which is passed on to the shards. */
    private static final String TERMS_HANDLER_PATH = "/terms";

    /** @return the qualified name lux:field-terms */
    @Override
    public StructuredQName getFunctionQName() {
        return new StructuredQName("lux", FunCall.LUX_NAMESPACE, "field-terms");
    }

    /** @return the declared argument types: two optional strings (field name, start value) */
    @Override
    public SequenceType[] getArgumentTypes() {
        return new SequenceType[] { SequenceType.OPTIONAL_STRING, SequenceType.OPTIONAL_STRING };
    }

    /** @return 0; both arguments may be omitted */
    @Override
    public int getMinimumNumberOfArguments() {
        return 0;
    }

    /** @return 2 */
    @Override
    public int getMaximumNumberOfArguments() {
        return 2;
    }

    @Override
    public boolean trustResultType() {
        return true;
    }

    /** @return a sequence of atomic values, regardless of the supplied argument types */
    @Override
    public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
        return SequenceType.ATOMIC_SEQUENCE;
    }

    @Override
    public ExtensionFunctionCall makeCallExpression() {
        return new FieldTermsCall();
    }

    /**
     * The call implementation: resolves the field name and start value from the
     * arguments and returns a lazily-evaluated sequence of terms.
     */
    class FieldTermsCall extends ExtensionFunctionCall {

        /**
         * @param context the dynamic context, from which the {@link Evaluator} is obtained
         * @param arguments zero, one or two arguments: the field name and the start value
         * @return the terms as a lazy sequence, or the empty sequence when no field name
         * was supplied and no default field is configured
         * @throws XPathException if the Lucene terms enumeration cannot be opened
         */
        @Override
        public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException {
            String fieldName = null;
            String start = "";
            if (arguments.length > 0) {
                Item arg0 = arguments[0].head();
                if (arg0 != null) {
                    fieldName = arg0.getStringValue();
                }
            }
            if (arguments.length > 1) {
                Item arg1 = arguments[1].head();
                if (arg1 != null) {
                    start = arg1.getStringValue();
                }
            }
            Evaluator eval = SearchBase.getEvaluator(context);
            try {
                if (fieldName == null) {
                    fieldName = eval.getCompiler().getIndexConfiguration().getDefaultFieldName();
                    if (fieldName == null) {
                        return EmptySequence.getInstance();
                    }
                }
                Term term = new Term(fieldName, start);
                if (eval.getQueryContext() instanceof SolrQueryContext) {
                    XQueryComponent xqueryComponent = ((SolrQueryContext) eval.getQueryContext()).getQueryComponent();
                    // shards are only set for distributed (cloud) requests; those go through Solr
                    if (xqueryComponent.getCurrentShards() != null) {
                        return new LazySequence(new SolrTermsIterator(eval, term));
                    }
                }
                return new LazySequence(new TermsIterator(eval, term));
            } catch (IOException e) {
                throw new XPathException("failed getting terms from field " + fieldName, e);
            }
        }

    }

    /**
     * Retrieves terms from the index using Solr's TermsComponent.  Currently used only for cloud requests,
     * but in the future we may want to use it to expose Solr's Terms functionality, which is richer 
     * than the basic TermsEnum API in Lucene.  Be aware though that this iterator retrieves terms 
     * via Solr's HTTP API.
     *
     * Terms are fetched in batches of {@link #SOLR_TERMS_BATCH_SIZE}; each subsequent batch starts
     * just after the last term returned.
     */
    class SolrTermsIterator implements SequenceIterator {
        private final Evaluator eval;
        private final Term term;  // the requested field and starting position (inclusive)
        private final XQueryComponent xqueryComponent;
        private int offset; // the number of terms returned before the current batch was fetched
        private int pos;    // the number of terms returned so far over the entire iteration
        private String current; // the last value returned; also the (exclusive) lower bound of the next batch
        private SolrQueryResponse response; // the current batch, or null when a new one must be fetched
        private boolean finished; // set once the end of the terms has been reached

        SolrTermsIterator(Evaluator eval, Term term) {
            this.term = term;
            this.eval = eval;
            pos = 0;
            offset = 0;
            xqueryComponent = ((SolrQueryContext) eval.getQueryContext()).getQueryComponent();
        }

        /**
         * @return the next term, or null when there are no more terms, or when no terms
         * handler is configured (which is logged as an error)
         * @throws XPathException if the terms request recorded a failure
         */
        @Override
        public AtomicValue next() throws XPathException {
            if (finished) {
                return null;
            }
            for (;;) {
                if (response == null && !getMoreTerms()) {
                    // no handler available: end the sequence rather than dereferencing a null response
                    return endOfTerms();
                }
                NamedList<?> terms = currentBatch();
                if (terms == null || terms.size() == 0) {
                    return endOfTerms();
                }
                int idx = pos - offset;
                if (idx >= terms.size()) {
                    // current batch consumed; fetch the following one on the next pass
                    response = null;
                } else {
                    current = terms.getName(idx);
                    pos += 1;
                    return new StringValue(current);
                }
            }
        }

        /** Marks the iteration as complete and returns the end-of-sequence marker (null). */
        private AtomicValue endOfTerms() {
            finished = true;
            return null;
        }

        /** Extracts the term list for the requested field from the current response; null if absent. */
        private NamedList<?> currentBatch() {
            NamedList<?> termFields = (NamedList<?>) response.getValues().get("terms");
            if (termFields == null) {
                return null;
            }
            return (NamedList<?>) termFields.get(term.field());
        }

        /**
         * Requests the next batch of terms from the terms handler and stores the response.
         *
         * @return false if no terms handler is configured, true if a response was obtained
         * @throws XPathException if the handler recorded an exception on the response
         */
        private boolean getMoreTerms() throws XPathException {
            SolrRequestHandler termsHandler = xqueryComponent.getCore().getRequestHandler(TERMS_HANDLER_PATH);
            if (termsHandler == null) {
                LoggerFactory.getLogger(getClass()).error("No /terms handler configured; lux:field-terms giving up");
                return false;
            }
            ModifiableSolrParams params = new ModifiableSolrParams();
            params.add(TermsParams.TERMS_FIELD, term.field());
            if (current != null) {
                // continue strictly after the last term already returned
                params.add(TermsParams.TERMS_LOWER, current);
                params.add(TermsParams.TERMS_LOWER_INCLUSIVE, "false");
                offset = pos;
            } else {
                // first batch: start at the requested term
                params.add(TermsParams.TERMS_LOWER, term.text());
            }
            params.add(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_INDEX);
            params.add(TermsParams.TERMS_LIMIT, Integer.toString(SOLR_TERMS_BATCH_SIZE));
            params.add("distrib", "true");
            params.add(ShardParams.SHARDS, StringUtils.join(xqueryComponent.getCurrentShards(), ","));
            params.add(ShardParams.SHARDS_QT, TERMS_HANDLER_PATH); // this gets passed to the shards to tell them what the request is
            // NOTE(review): the request is never closed here; confirm whether CloudQueryRequest holds
            // resources that need releasing.
            SolrQueryRequest req = new CloudQueryRequest(xqueryComponent.getCore(), params, null);
            SolrQueryResponse rsp = new SolrQueryResponse();
            termsHandler.handleRequest(req, rsp);
            if (rsp.getException() != null) {
                throw new XPathException("failed getting terms from field " + term.field(), rsp.getException());
            }
            response = rsp;
            return true;
        }

        /** @return the term most recently returned by next(), or null if there is no current item */
        @Override
        public AtomicValue current() {
            if (finished || current == null) {
                return null;
            }
            return new StringValue(current);
        }

        /** @return the number of terms returned so far, or -1 once the end has been reached */
        @Override
        public int position() {
            return finished ? -1 : pos;
        }

        @Override
        public void close() {

        }

        /** @return a fresh iterator over the same field, positioned at the original start term */
        @Override
        public SequenceIterator getAnother() throws XPathException {
            return new SolrTermsIterator(eval, term);
        }

        @Override
        public int getProperties() {
            return 0;
        }

    }

    /**
     * Retrieves terms from the Lucene index directly, using TermsEnum.  The iterator
     * reads one term ahead: {@code next} holds the term that the following call to
     * next() will return.
     */
    class TermsIterator implements SequenceIterator {
        private TermsEnum terms;
        private final Evaluator eval;
        private final Term term;
        private int pos;
        private String current;
        private String next;

        TermsIterator(Evaluator eval, Term term) throws IOException {
            this.term = term;
            this.eval = eval;
            pos = 0;
            createTermsEnum(term);
        }

        /**
         * Opens a TermsEnum on the term's field and positions it at the first term
         * &gt;= the given term.  Leaves {@code next} null when the reader has no fields,
         * the field has no terms, or the seek runs off the end.
         */
        private void createTermsEnum(Term t) throws IOException {
            // TODO: get atomic sub readers and iterate values from those 
            /*  From: http://lucene.apache.org/core/4_0_0-BETA/MIGRATE.html

                Note that the MultiFields approach entails a performance
                hit on MultiReaders, as it must merge terms/docs/positions
                on the fly. It's generally better to instead get the
                sequential readers (use oal.util.ReaderUtil) and then step
                through those readers yourself, if you can (this is how
                Lucene drives searches).
            */
            Fields fields = MultiFields.getFields(eval.getSearcher().getIndexReader());
            if (fields == null) {
                return;
            }
            Terms fieldTerms = fields.terms(t.field());
            if (fieldTerms == null) {
                return;
            }
            terms = fieldTerms.iterator(null);
            // the Term already carries its text as a BytesRef; no charset lookup needed
            if (terms.seekCeil(t.bytes()) != TermsEnum.SeekStatus.END) {
                next = terms.term().utf8ToString();
            }
        }

        /**
         * @return the next term, or null when the terms are exhausted
         * @throws XPathException wrapping any IOException raised while advancing the enumeration
         */
        @Override
        public AtomicValue next() throws XPathException {
            if (next == null) {
                pos = -1;
                current = null;
                return null;
            }
            ++pos;
            current = next;
            try {
                BytesRef bytesRef = terms.next();
                next = (bytesRef == null) ? null : bytesRef.utf8ToString();
            } catch (IOException e) {
                throw new XPathException(e);
            }
            return new StringValue(current);
        }

        /** @return the term most recently returned by next(), or null if there is no current item */
        @Override
        public AtomicValue current() {
            return current == null ? null : new StringValue(current);
        }

        /** @return the number of terms returned so far, or -1 once the end has been reached */
        @Override
        public int position() {
            return pos;
        }

        @Override
        public void close() {
        }

        /** @return a fresh iterator over the same field, positioned at the original start term */
        @Override
        public SequenceIterator getAnother() throws XPathException {
            try {
                return new TermsIterator(eval, term);
            } catch (IOException e) {
                throw new XPathException(e);
            }
        }

        @Override
        public int getProperties() {
            return 0;
        }

    }

}

/*
 * This Source Code Form is subject to the terms of the Mozilla Public License,
 * v. 2.0. If a copy of the MPL was not distributed with this file, You can
 * obtain one at http://mozilla.org/MPL/2.0/.
 */