/*
 * This software was produced for the U. S. Government
 * under Contract No. W15P7T-11-C-F600, and is
 * subject to the Rights in Noncommercial Computer Software
 * and Noncommercial Computer Software Documentation
 * Clause 252.227-7014 (JUN 1995)
 *
 * Copyright 2013 The MITRE Corporation. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.tagger;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import javax.xml.stream.XMLStreamException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Terms;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRef;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.CollectionUtil;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.QParser;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SolrReturnFields;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.security.AuthorizationContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Scans posted text, looking for matching strings in the Solr index. The public static final String
 * members are request parameters. This handler is also called the "SolrTextTagger".
 *
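 * <p>Minimal SolrJ invocation sketch (illustrative only: the "/tag" handler path, the
 * "name_tag" field, the "geonames" collection, and the pre-built {@code solrClient} are
 * assumptions, not defined by this class):
 *
 * <pre>{@code
 * ModifiableSolrParams params = new ModifiableSolrParams();
 * params.set("field", "name_tag");
 * params.set(TaggerRequestHandler.OVERLAPS, "NO_SUB");
 * GenericSolrRequest tagRequest =
 *     new GenericSolrRequest(SolrRequest.METHOD.POST, "/tag", params)
 *         .setContentWriter(new RequestWriter.StringPayloadContentWriter(
 *             "My flight lands in Boston tonight.", "text/plain"));
 * NamedList<Object> tagResponse = solrClient.request(tagRequest, "geonames");
 * }</pre>
 *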
 * @since 7.4.0
 */
public class TaggerRequestHandler extends RequestHandlerBase {

  /** Request parameter. */
  public static final String OVERLAPS = "overlaps";

  /** Request parameter. */
  public static final String TAGS_LIMIT = "tagsLimit";

  /** Request parameter. */
  public static final String MATCH_TEXT = "matchText";

  /** Request parameter. */
  public static final String SKIP_ALT_TOKENS = "skipAltTokens";

  /** Request parameter. */
  public static final String IGNORE_STOPWORDS = "ignoreStopwords";

  /** Request parameter. */
  public static final String XML_OFFSET_ADJUST = "xmlOffsetAdjust";

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  public String getDescription() {
    return "Processes input text to find matching tokens stored in the index.";
  }

  @Override
  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {

    // --Read params
    final String indexedField = req.getParams().get("field");
    if (indexedField == null)
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "required param 'field'");

    final TagClusterReducer tagClusterReducer =
        chooseTagClusterReducer(req.getParams().get(OVERLAPS));
    final int rows = req.getParams().getInt(CommonParams.ROWS, 10000);
    final int tagsLimit = req.getParams().getInt(TAGS_LIMIT, 1000);
    final boolean addMatchText = req.getParams().getBool(MATCH_TEXT, false);
    final SchemaField idSchemaField = req.getSchema().getUniqueKeyField();
    if (idSchemaField == null) {
      throw new SolrException(
          SolrException.ErrorCode.SERVER_ERROR,
          "The tagger requires a uniqueKey in the schema."); // TODO this could be relaxed
    }
    final boolean skipAltTokens = req.getParams().getBool(SKIP_ALT_TOKENS, false);
    final boolean ignoreStopWords =
        req.getParams().getBool(IGNORE_STOPWORDS, fieldHasIndexedStopFilter(indexedField, req));

    // --Get posted data
    Reader inputReader = null;
    Iterable<ContentStream> streams = req.getContentStreams();
    if (streams != null) {
      Iterator<ContentStream> iter = streams.iterator();
      if (iter.hasNext()) {
        inputReader = iter.next().getReader();
      }
      if (iter.hasNext()) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            getClass().getSimpleName()
                + " does not support multiple ContentStreams"); // TODO support bulk tagging?
      }
    }
    if (inputReader == null) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          getClass().getSimpleName() + " requires text to be POSTed to it");
    }

    // We may or may not need to read the input into a string
    final InputStringLazy inputStringFuture = new InputStringLazy(inputReader);

    final OffsetCorrector offsetCorrector = getOffsetCorrector(req.getParams(), inputStringFuture);

    final String inputString; // only populated if needed
    if (addMatchText || inputStringFuture.inputString != null) {
      // Read the input fully into a String buffer that we'll need later,
      // then replace the input with a reader wrapping the buffer.
      inputString = inputStringFuture.call();
      inputReader.close();
      inputReader = new StringReader(inputString);
    } else {
      inputString = null; // not used
    }

    final SolrIndexSearcher searcher = req.getSearcher();
    final FixedBitSet matchDocIdsBS = new FixedBitSet(searcher.maxDoc());
    final List<SimpleOrderedMap<Object>> tags = new ArrayList<>(2000);

    try {
      Analyzer analyzer = req.getSchema().getField(indexedField).getType().getQueryAnalyzer();
      try (TokenStream tokenStream = analyzer.tokenStream("", inputReader)) {
        Terms terms = searcher.getSlowAtomicReader().terms(indexedField);
        if (terms != null) {
          Tagger tagger =
              new Tagger(
                  terms,
                  computeDocCorpus(req),
                  tokenStream,
                  tagClusterReducer,
                  skipAltTokens,
                  ignoreStopWords) {
                @Override
                protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) {
                  if (tags.size() >= tagsLimit) return;
                  if (offsetCorrector != null) {
                    int[] offsetPair = offsetCorrector.correctPair(startOffset, endOffset);
                    if (offsetPair == null) {
                      log.debug(
                          "Discarded offsets [{}, {}] because couldn't balance XML.",
                          startOffset,
                          endOffset);
                      return;
                    }
                    startOffset = offsetPair[0];
                    endOffset = offsetPair[1];
                  }

                  SimpleOrderedMap<Object> tag = new SimpleOrderedMap<>();
                  tag.add("startOffset", startOffset);
                  tag.add("endOffset", endOffset);
                  if (addMatchText)
                    tag.add("matchText", inputString.substring(startOffset, endOffset));
                  // below caches, and also flags matchDocIdsBS
                  tag.add("ids", lookupSchemaDocIds(docIdsKey));
                  tags.add(tag);
                }

                Map<Object, List<Object>> docIdsListCache = CollectionUtil.newHashMap(2000);

                ValueSourceAccessor uniqueKeyCache =
                    new ValueSourceAccessor(
                        searcher, idSchemaField.getType().getValueSource(idSchemaField, null));

                private List<Object> lookupSchemaDocIds(Object docIdsKey) {
                  List<Object> schemaDocIds = docIdsListCache.get(docIdsKey);
                  if (schemaDocIds != null) return schemaDocIds;
                  IntsRef docIds = lookupDocIds(docIdsKey);
                  // translate lucene docIds to schema ids
                  schemaDocIds = new ArrayList<>(docIds.length);
                  for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
                    int docId = docIds.ints[i];
                    assert i == docIds.offset || docIds.ints[i - 1] < docId : "not sorted?";
                    matchDocIdsBS.set(docId); // also, flip docid in bitset
                    try {
                      schemaDocIds.add(uniqueKeyCache.objectVal(docId)); // translates here
                    } catch (IOException e) {
                      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
                    }
                  }
                  assert !schemaDocIds.isEmpty();

                  docIdsListCache.put(docIds, schemaDocIds);
                  return schemaDocIds;
                }
              };
          tagger.enableDocIdsCache(2000); // TODO configurable
          tagger.process();
        }
      }
    } finally {
      inputReader.close();
    }
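    // Shape of the response assembled below (values illustrative; "matchText" appears only
    // when the matchText param is true):
    //   tagsCount: 2
    //   tags: [{startOffset:19, endOffset:25, matchText:"Boston", ids:["4930956"]}, ...]
    //   response: the matching documents as a standard doc list, honoring 'rows' and 'fl'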
    rsp.add("tagsCount", tags.size());
    rsp.add("tags", tags);

    rsp.setReturnFields(new SolrReturnFields(req));

    // Solr's standard name for matching docs in response
    rsp.add("response", getDocList(rows, matchDocIdsBS));
  }

  @Override
  public Name getPermissionName(AuthorizationContext request) {
    return Name.READ_PERM;
  }

  private static class InputStringLazy implements Callable<String> {
    final Reader inputReader;
    String inputString;

    InputStringLazy(Reader inputReader) {
      this.inputReader = inputReader;
    }

    @Override
    public String call() throws IOException {
      if (inputString == null) {
        inputString = StrUtils.stringFromReader(inputReader);
      }
      return inputString;
    }
  }

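  /*
   * When XML_OFFSET_ADJUST is requested, the posted text is parsed as XML and returned tag
   * offsets are corrected so that a client can insert elements at each offset pair without
   * breaking well-formedness; tags whose offsets cannot be balanced are discarded (see the
   * log.debug in tagCallback above).
   */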
  protected OffsetCorrector getOffsetCorrector(
      SolrParams params, Callable<String> inputStringProvider) throws Exception {
    final boolean xmlOffsetAdjust = params.getBool(XML_OFFSET_ADJUST, false);
    if (!xmlOffsetAdjust) {
      return null;
    }
    try {
      return new XmlOffsetCorrector(inputStringProvider.call());
    } catch (XMLStreamException e) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST, "Expecting XML but wasn't: " + e, e);
    }
  }

  private DocList getDocList(int rows, FixedBitSet matchDocIdsBS) throws IOException {
    // Now we must supply a Solr DocList and add it to the response.
    //  Typically this is gotten via a SolrIndexSearcher.search(), but in this case we
    //  know exactly what documents to return, the order doesn't matter nor does
    //  scoring.
    //  Ideally an implementation of DocList could be directly implemented off
    //  of a BitSet, but there are way too many methods to implement for a minor
    //  payoff.
    int matchDocs = matchDocIdsBS.cardinality();
    int[] docIds = new int[Math.min(rows, matchDocs)];
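    // the second BitSetIterator argument is only a cost estimate for DocIdSetIterator.cost();
    // the constant 1 is fine here since we fully consume the iterator anyway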
    DocIdSetIterator docIdIter = new BitSetIterator(matchDocIdsBS, 1);
    for (int i = 0; i < docIds.length; i++) {
      docIds[i] = docIdIter.nextDoc();
    }
    return new DocSlice(0, docIds.length, docIds, null, matchDocs, 1f, TotalHits.Relation.EQUAL_TO);
  }

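  /*
   * Example of the three overlap modes (illustrative dictionary): tagging "New York City"
   * against the entries "New York", "New York City" and "City":
   *  - ALL emits all three tags;
   *  - NO_SUB drops "New York" and "City" since each lies entirely within "New York City";
   *  - LONGEST_DOMINANT_RIGHT keeps only "New York City", the longest tag of the overlapping
   *    cluster (ties broken by the right-most tag).
   */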
  private TagClusterReducer chooseTagClusterReducer(String overlaps) {
    TagClusterReducer tagClusterReducer;
    if (overlaps == null || overlaps.equals("NO_SUB")) {
      tagClusterReducer = TagClusterReducer.NO_SUB;
    } else if (overlaps.equals("ALL")) {
      tagClusterReducer = TagClusterReducer.ALL;
    } else if (overlaps.equals("LONGEST_DOMINANT_RIGHT")) {
      tagClusterReducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
    } else {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST, "unknown tag overlap mode: " + overlaps);
    }
    return tagClusterReducer;
  }

  /**
   * The set of documents matching the provided 'fq' (filter query). Don't include deleted docs
   * either. If null is returned, then all docs are available.
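   * For example, a request carrying {@code fq=type:city} and {@code fq=country:US} restricts
   * tagging to documents matching both filters (field names illustrative).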
   */
  private Bits computeDocCorpus(SolrQueryRequest req) throws SyntaxError, IOException {
    final String[] corpusFilterQueries = req.getParams().getParams("fq");
    final SolrIndexSearcher searcher = req.getSearcher();
    final Bits docBits;
    if (corpusFilterQueries != null && corpusFilterQueries.length > 0) {
      List<Query> filterQueries = new ArrayList<>(corpusFilterQueries.length);
      for (String corpusFilterQuery : corpusFilterQueries) {
        QParser qParser = QParser.getParser(corpusFilterQuery, null, req);
        try {
          filterQueries.add(qParser.parse());
        } catch (SyntaxError e) {
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
        }
      }

      final DocSet docSet = searcher.getDocSet(filterQueries); // hopefully in the cache

      docBits = docSet.getBits();
    } else {
      docBits = searcher.getSlowAtomicReader().getLiveDocs();
    }
    return docBits;
  }

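  /*
   * Illustrative schema excerpt that this method detects, making IGNORE_STOPWORDS default to
   * true (tokenizer choice and stopword file name are assumptions):
   *
   *   <analyzer type="index">
   *     <tokenizer class="solr.StandardTokenizerFactory"/>
   *     <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
   *     ...
   *   </analyzer>
   */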
  private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
    FieldType fieldType = req.getSchema().getFieldType(field);
    Analyzer analyzer = fieldType.getIndexAnalyzer(); // index analyzer
    if (analyzer instanceof TokenizerChain) {
      TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
      TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
      for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
        if (tokenFilterFactory instanceof StopFilterFactory) return true;
      }
    }
    return false;
  }

  /** See LUCENE-4541 or {@link org.apache.solr.response.transform.ValueSourceAugmenter}. */
  static class ValueSourceAccessor {
    private final List<LeafReaderContext> readerContexts;
    private final ValueSource valueSource;
    private final Map<Object, Object> fContext;
    private final FunctionValues[] functionValuesPerSeg;
    private final int[] functionValuesDocIdPerSeg;

    ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) {
      readerContexts = searcher.getIndexReader().leaves();
      this.valueSource = valueSource;
      fContext = ValueSource.newContext(searcher);
      functionValuesPerSeg = new FunctionValues[readerContexts.size()];
      functionValuesDocIdPerSeg = new int[readerContexts.size()];
    }

    Object objectVal(int topDocId) throws IOException {
      // lookup segment level stuff:
      int segIdx = ReaderUtil.subIndex(topDocId, readerContexts);
      LeafReaderContext rcontext = readerContexts.get(segIdx);
      int segDocId = topDocId - rcontext.docBase;
      // unfortunately Lucene 7.0 requires forward only traversal (with no reset method).
      //   So we need to track our last docId (per segment) and re-fetch the FunctionValues. :-(
      FunctionValues functionValues = functionValuesPerSeg[segIdx];
      if (functionValues == null || segDocId < functionValuesDocIdPerSeg[segIdx]) {
        functionValues = functionValuesPerSeg[segIdx] = valueSource.getValues(fContext, rcontext);
      }
      functionValuesDocIdPerSeg[segIdx] = segDocId;

      // get value:
      return functionValues.objectVal(segDocId);
    }
  }
}