// lux.functions.FieldTerms (Maven / Gradle / Ivy)
package lux.functions;
import java.io.IOException;

import lux.Evaluator;
import lux.index.IndexConfiguration;
import lux.index.field.XmlTextField;
import lux.solr.CloudQueryRequest;
import lux.solr.SolrQueryContext;
import lux.solr.XQueryComponent;
import lux.xpath.FunCall;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.lib.ExtensionFunctionCall;
import net.sf.saxon.lib.ExtensionFunctionDefinition;
import net.sf.saxon.om.Item;
import net.sf.saxon.om.LazySequence;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.om.SequenceIterator;
import net.sf.saxon.om.StructuredQName;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.value.AtomicValue;
import net.sf.saxon.value.EmptySequence;
import net.sf.saxon.value.SequenceType;
import net.sf.saxon.value.StringValue;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.TermsParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.response.SolrQueryResponse;
import org.slf4j.LoggerFactory;
/**
 * <code>function lux:field-terms($field-name as xs:string?, $start as xs:string?) as xs:anyAtomicType*</code>
 * <p>
 * This function accepts the name of a Lucene field, and a starting value, and
 * returns the sequence of terms drawn from the field, ordered according to its
 * natural order, starting with the first term that is &gt;= the starting value.
 * </p>
 * <p>
 * If the $field-name argument is empty, the terms are drawn from the default
 * field defined by the {@link IndexConfiguration}, generally the
 * {@link XmlTextField}.
 * </p>
 */
public class FieldTerms extends ExtensionFunctionDefinition {
public StructuredQName getFunctionQName() {
return new StructuredQName("lux", FunCall.LUX_NAMESPACE, "field-terms");
public SequenceType[] getArgumentTypes() {
return new SequenceType[] { SequenceType.OPTIONAL_STRING, SequenceType.OPTIONAL_STRING };
public int getMinimumNumberOfArguments() {
return 0;
public int getMaximumNumberOfArguments() {
return 2;
public boolean trustResultType() {
return true;
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
return SequenceType.ATOMIC_SEQUENCE;
public ExtensionFunctionCall makeCallExpression() {
return new FieldTermsCall();
class FieldTermsCall extends ExtensionFunctionCall {
public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException {
String fieldName = null, start = "";
if (arguments.length > 0) {
Item arg0 = arguments[0].head();
if (arg0 != null) {
fieldName = arg0.getStringValue();
if (arguments.length > 1) {
Item arg1 = arguments[1].head();
start = arg1 == null ? "" : arg1.getStringValue();
Evaluator eval = SearchBase.getEvaluator(context);
try {
if (fieldName == null) {
fieldName = eval.getCompiler().getIndexConfiguration().getDefaultFieldName();
if (fieldName == null) {
return EmptySequence.getInstance();
Term term = new Term(fieldName, start);
if (eval.getQueryContext() instanceof SolrQueryContext) {
XQueryComponent xqueryComponent = ((SolrQueryContext) eval.getQueryContext()).getQueryComponent();
if (xqueryComponent.getCurrentShards() != null) {
return new LazySequence (new SolrTermsIterator(eval, term));
return new LazySequence(new TermsIterator(eval, term));
} catch (IOException e) {
throw new XPathException("failed getting terms from field " + fieldName, e);
* Retrieves terms from the index using Solr's TermsComponent. Currently used only for cloud requests,
* but in the future we may want to use it to get expose Solr's Terms functionality, which is richer
* than the basic TermsEnum API in Lucene. Be aware thoughthat this iterator retrieves terms
* via Solr's HTTP API.
class SolrTermsIterator implements SequenceIterator {
private final Evaluator eval;
private Term term; // the requested field and starting position (inclusive)
private int offset; // the starting position of the current batch
private int pos; // the absolute position from the start of the entire iteration
private String current; // the last value returned
private XQueryComponent xqueryComponent;
private SolrQueryResponse response;
SolrTermsIterator(Evaluator eval, Term term) {
this.term = term;
this.eval = eval;
pos = -1;
offset = 0;
xqueryComponent = ((SolrQueryContext)eval.getQueryContext()).getQueryComponent();
public AtomicValue next() throws XPathException {
for (;;) {
if (response == null) {
getMoreTerms ();
NamedList> termFields = (NamedList>) response.getValues().get("terms");
NamedList> terms = (NamedList>) termFields.get(term.field());
if (terms.size() == 0) {
return null;
int idx = pos - offset;
if (idx >= terms.size()) {
response = null;
} else {
current = terms.getName(idx);
// Integer fieldTermCount = (Integer) terms.getVal(pos);
pos += 1;
return new StringValue(current);
private void getMoreTerms() {
SolrRequestHandler termsHandler = xqueryComponent.getCore().getRequestHandler("/terms");
if (termsHandler == null) {
LoggerFactory.getLogger(getClass()).error("No /terms handler configured; lux:field-terms giving up");
ModifiableSolrParams params = new ModifiableSolrParams();
params.add(TermsParams.TERMS_FIELD, term.field());
if (current != null) {
params.add(TermsParams.TERMS_LOWER, current);
params.add(TermsParams.TERMS_LOWER_INCLUSIVE, "false");
offset = pos;
} else {
pos = 0;
params.add(TermsParams.TERMS_LOWER, term.text());
params.add(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_INDEX);
params.add(TermsParams.TERMS_LIMIT, Integer.toString(100));
params.add("distrib", "true");
params.add(ShardParams.SHARDS, StringUtils.join(xqueryComponent.getCurrentShards(), ","));
params.add(ShardParams.SHARDS_QT, "/terms"); // this gets passed to the shards to tell them what the request is
SolrQueryRequest req = new CloudQueryRequest(xqueryComponent.getCore(), params, null);
response = new SolrQueryResponse();
termsHandler.handleRequest(req, response);
public AtomicValue current() {
return new StringValue(current);
public int position() {
return pos;
public void close() {
public SequenceIterator getAnother() throws XPathException {
return new SolrTermsIterator(eval, term);
public int getProperties() {
return 0;
* Retrieves terms from the Lucene index directly, using TermsEnum.
class TermsIterator implements SequenceIterator {
private TermsEnum terms;
private final Evaluator eval;
private Term term;
private int pos;
private String current;
private String next;
TermsIterator(Evaluator eval, Term term) throws IOException {
this.term = term;
this.eval = eval;
pos = 0;
private void createTermsEnum(Term t) throws IOException {
String fieldName = t.field();
// TODO: get atomic sub readers and iterate values from those
/* From:
Note that the MultiFields approach entails a performance
hit on MultiReaders, as it must merge terms/docs/positions
on the fly. It's generally better to instead get the
sequential readers (use oal.util.ReaderUtil) and then step
through those readers yourself, if you can (this is how
Lucene drives searches).
Fields fields = MultiFields.getFields(eval.getSearcher().getIndexReader());
if (fields != null) {
Terms fieldTerms = fields.terms(fieldName);
if (fieldTerms != null) {
terms = fieldTerms.iterator(null);
if (t != null) {
if (terms.seekCeil(new BytesRef(t.text().getBytes("utf-8"))) != TermsEnum.SeekStatus.END) {
next = terms.term().utf8ToString();
public AtomicValue next() throws XPathException {
try {
if (next == null) {
pos = -1;
return null;
current = next;
BytesRef bytesRef =;
if (bytesRef == null) {
next = null;
} else {
next = bytesRef.utf8ToString();
return new net.sf.saxon.value.StringValue(current);
} catch (IOException e) {
throw new XPathException(e);
public AtomicValue current() {
return new net.sf.saxon.value.StringValue(current);
public int position() {
return pos;
public void close() {
public SequenceIterator getAnother() throws XPathException {
try {
return new TermsIterator(eval, term);
} catch (IOException e) {
throw new XPathException(e);
public int getProperties() {
return 0;
/*
 * This Source Code Form is subject to the terms of the Mozilla Public License,
 * v. 2.0. If a copy of the MPL was not distributed with this file, You can
 * obtain one at http://mozilla.org/MPL/2.0/.
 */