/*
 * This software was produced for the U. S. Government
 * under Contract No. W15P7T-11-C-F600, and is
 * subject to the Rights in Noncommercial Computer Software
 * and Noncommercial Computer Software Documentation
 * Clause 252.227-7014 (JUN 1995)
 *
 * Copyright 2013 The MITRE Corporation. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.tagger;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import javax.xml.stream.XMLStreamException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Terms;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRef;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.CollectionUtil;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.QParser;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SolrReturnFields;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.security.AuthorizationContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Scans posted text, looking for matching strings in the Solr index. The public static final String
 * members are request parameters. This handler is also called the "SolrTextTagger".
 *
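 * <p>Minimal SolrJ invocation sketch (illustrative only: the "/tag" handler path, the
 * "name_tag" field, the "geonames" collection, and the pre-built {@code solrClient} are
 * assumptions, not defined by this class):
 *
 * <pre>{@code
 * ModifiableSolrParams params = new ModifiableSolrParams();
 * params.set("field", "name_tag");
 * params.set(TaggerRequestHandler.OVERLAPS, "NO_SUB");
 * GenericSolrRequest tagRequest =
 *     new GenericSolrRequest(SolrRequest.METHOD.POST, "/tag", params)
 *         .setContentWriter(new RequestWriter.StringPayloadContentWriter(
 *             "My flight lands in Boston tonight.", "text/plain"));
 * NamedList<Object> tagResponse = solrClient.request(tagRequest, "geonames");
 * }</pre>
 *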
 * @since 7.4.0
 */
public class TaggerRequestHandler extends RequestHandlerBase {

  /** Request parameter. */
  public static final String OVERLAPS = "overlaps";

  /** Request parameter. */
  public static final String TAGS_LIMIT = "tagsLimit";

  /** Request parameter. */
  public static final String MATCH_TEXT = "matchText";

  /** Request parameter. */
  public static final String SKIP_ALT_TOKENS = "skipAltTokens";

  /** Request parameter. */
  public static final String IGNORE_STOPWORDS = "ignoreStopwords";

  /** Request parameter. */
  public static final String XML_OFFSET_ADJUST = "xmlOffsetAdjust";

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  public String getDescription() {
    return "Processes input text to find matching tokens stored in the index.";
  }

  @Override
  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {

    // --Read params
    final String indexedField = req.getParams().get("field");
    if (indexedField == null)
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "required param 'field'");

    final TagClusterReducer tagClusterReducer =
        chooseTagClusterReducer(req.getParams().get(OVERLAPS));
    final int rows = req.getParams().getInt(CommonParams.ROWS, 10000);
    final int tagsLimit = req.getParams().getInt(TAGS_LIMIT, 1000);
    final boolean addMatchText = req.getParams().getBool(MATCH_TEXT, false);
    final SchemaField idSchemaField = req.getSchema().getUniqueKeyField();
    if (idSchemaField == null) {
      throw new SolrException(
          SolrException.ErrorCode.SERVER_ERROR,
          "The tagger requires a uniqueKey in the schema."); // TODO this could be relaxed
    }
    final boolean skipAltTokens = req.getParams().getBool(SKIP_ALT_TOKENS, false);
    final boolean ignoreStopWords =
        req.getParams().getBool(IGNORE_STOPWORDS, fieldHasIndexedStopFilter(indexedField, req));

    // --Get posted data
    Reader inputReader = null;
    Iterable<ContentStream> streams = req.getContentStreams();
    if (streams != null) {
      Iterator<ContentStream> iter = streams.iterator();
      if (iter.hasNext()) {
        inputReader = iter.next().getReader();
      }
      if (iter.hasNext()) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            getClass().getSimpleName()
                + " does not support multiple ContentStreams"); // TODO support bulk tagging?
      }
    }
    if (inputReader == null) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          getClass().getSimpleName() + " requires text to be POSTed to it");
    }

    // We may or may not need to read the input into a string
    final InputStringLazy inputStringFuture = new InputStringLazy(inputReader);

    final OffsetCorrector offsetCorrector = getOffsetCorrector(req.getParams(), inputStringFuture);

    final String inputString; // only populated if needed
    if (addMatchText || inputStringFuture.inputString != null) {
      // Read the input fully into a String buffer that we'll need later,
      // then replace the input with a reader wrapping the buffer.
      inputString = inputStringFuture.call();
      inputReader.close();
      inputReader = new StringReader(inputString);
    } else {
      inputString = null; // not used
    }

    final SolrIndexSearcher searcher = req.getSearcher();
    final FixedBitSet matchDocIdsBS = new FixedBitSet(searcher.maxDoc());
    final List<SimpleOrderedMap<Object>> tags = new ArrayList<>(2000);

    try {
      Analyzer analyzer = req.getSchema().getField(indexedField).getType().getQueryAnalyzer();
      try (TokenStream tokenStream = analyzer.tokenStream("", inputReader)) {
        Terms terms = searcher.getSlowAtomicReader().terms(indexedField);
        if (terms != null) {
          Tagger tagger =
              new Tagger(
                  terms,
                  computeDocCorpus(req),
                  tokenStream,
                  tagClusterReducer,
                  skipAltTokens,
                  ignoreStopWords) {
                @Override
                protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) {
                  if (tags.size() >= tagsLimit) return;
                  if (offsetCorrector != null) {
                    int[] offsetPair = offsetCorrector.correctPair(startOffset, endOffset);
                    if (offsetPair == null) {
                      log.debug(
                          "Discarded offsets [{}, {}] because couldn't balance XML.",
                          startOffset,
                          endOffset);
                      return;
                    }
                    startOffset = offsetPair[0];
                    endOffset = offsetPair[1];
                  }

                  SimpleOrderedMap<Object> tag = new SimpleOrderedMap<>();
                  tag.add("startOffset", startOffset);
                  tag.add("endOffset", endOffset);
                  if (addMatchText)
                    tag.add("matchText", inputString.substring(startOffset, endOffset));
                  // below caches, and also flags matchDocIdsBS
                  tag.add("ids", lookupSchemaDocIds(docIdsKey));
                  tags.add(tag);
                }

                Map<Object, List<Object>> docIdsListCache = CollectionUtil.newHashMap(2000);

                ValueSourceAccessor uniqueKeyCache =
                    new ValueSourceAccessor(
                        searcher, idSchemaField.getType().getValueSource(idSchemaField, null));

                private List<Object> lookupSchemaDocIds(Object docIdsKey) {
                  List<Object> schemaDocIds = docIdsListCache.get(docIdsKey);
                  if (schemaDocIds != null) return schemaDocIds;
                  IntsRef docIds = lookupDocIds(docIdsKey);
                  // translate lucene docIds to schema ids
                  schemaDocIds = new ArrayList<>(docIds.length);
                  for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
                    int docId = docIds.ints[i];
                    assert i == docIds.offset || docIds.ints[i - 1] < docId : "not sorted?";
                    matchDocIdsBS.set(docId); // also, flip docid in bitset
                    try {
                      schemaDocIds.add(uniqueKeyCache.objectVal(docId)); // translates here
                    } catch (IOException e) {
                      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
                    }
                  }
                  assert !schemaDocIds.isEmpty();

                  docIdsListCache.put(docIds, schemaDocIds);
                  return schemaDocIds;
                }
              };
          tagger.enableDocIdsCache(2000); // TODO configurable
          tagger.process();
        }
      }
    } finally {
      inputReader.close();
    }
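    // Shape of the response assembled below (values illustrative; "matchText" appears only
    // when the matchText param is true):
    //   tagsCount: 2
    //   tags: [{startOffset:19, endOffset:25, matchText:"Boston", ids:["4930956"]}, ...]
    //   response: the matching documents as a standard doc list, honoring 'rows' and 'fl'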
    rsp.add("tagsCount", tags.size());
    rsp.add("tags", tags);

    rsp.setReturnFields(new SolrReturnFields(req));

    // Solr's standard name for matching docs in response
    rsp.add("response", getDocList(rows, matchDocIdsBS));
  }

  @Override
  public Name getPermissionName(AuthorizationContext request) {
    return Name.READ_PERM;
  }

  private static class InputStringLazy implements Callable<String> {
    final Reader inputReader;
    String inputString;

    InputStringLazy(Reader inputReader) {
      this.inputReader = inputReader;
    }

    @Override
    public String call() throws IOException {
      if (inputString == null) {
        inputString = StrUtils.stringFromReader(inputReader);
      }
      return inputString;
    }
  }

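  /*
   * When XML_OFFSET_ADJUST is requested, the posted text is parsed as XML and returned tag
   * offsets are corrected so that a client can insert elements at each offset pair without
   * breaking well-formedness; tags whose offsets cannot be balanced are discarded (see the
   * log.debug in tagCallback above).
   */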
  protected OffsetCorrector getOffsetCorrector(
      SolrParams params, Callable<String> inputStringProvider) throws Exception {
    final boolean xmlOffsetAdjust = params.getBool(XML_OFFSET_ADJUST, false);
    if (!xmlOffsetAdjust) {
      return null;
    }
    try {
      return new XmlOffsetCorrector(inputStringProvider.call());
    } catch (XMLStreamException e) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST, "Expecting XML but wasn't: " + e, e);
    }
  }

  private DocList getDocList(int rows, FixedBitSet matchDocIdsBS) throws IOException {
    // Now we must supply a Solr DocList and add it to the response.
    //  Typically this is gotten via a SolrIndexSearcher.search(), but in this case we
    //  know exactly what documents to return, the order doesn't matter nor does
    //  scoring.
    //  Ideally an implementation of DocList could be directly implemented off
    //  of a BitSet, but there are way too many methods to implement for a minor
    //  payoff.
    int matchDocs = matchDocIdsBS.cardinality();
    int[] docIds = new int[Math.min(rows, matchDocs)];
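    // the second BitSetIterator argument is only a cost estimate for DocIdSetIterator.cost();
    // the constant 1 is fine here since we fully consume the iterator anyway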
    DocIdSetIterator docIdIter = new BitSetIterator(matchDocIdsBS, 1);
    for (int i = 0; i < docIds.length; i++) {
      docIds[i] = docIdIter.nextDoc();
    }
    return new DocSlice(0, docIds.length, docIds, null, matchDocs, 1f, TotalHits.Relation.EQUAL_TO);
  }

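  /*
   * Example of the three overlap modes (illustrative dictionary): tagging "New York City"
   * against the entries "New York", "New York City" and "City":
   *  - ALL emits all three tags;
   *  - NO_SUB drops "New York" and "City" since each lies entirely within "New York City";
   *  - LONGEST_DOMINANT_RIGHT keeps only "New York City", the longest tag of the overlapping
   *    cluster (ties broken by the right-most tag).
   */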
  private TagClusterReducer chooseTagClusterReducer(String overlaps) {
    TagClusterReducer tagClusterReducer;
    if (overlaps == null || overlaps.equals("NO_SUB")) {
      tagClusterReducer = TagClusterReducer.NO_SUB;
    } else if (overlaps.equals("ALL")) {
      tagClusterReducer = TagClusterReducer.ALL;
    } else if (overlaps.equals("LONGEST_DOMINANT_RIGHT")) {
      tagClusterReducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
    } else {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST, "unknown tag overlap mode: " + overlaps);
    }
    return tagClusterReducer;
  }

  /**
   * The set of documents matching the provided 'fq' (filter query). Don't include deleted docs
   * either. If null is returned, then all docs are available.
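   * For example, a request carrying {@code fq=type:city} and {@code fq=country:US} restricts
   * tagging to documents matching both filters (field names illustrative).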
   */
  private Bits computeDocCorpus(SolrQueryRequest req) throws SyntaxError, IOException {
    final String[] corpusFilterQueries = req.getParams().getParams("fq");
    final SolrIndexSearcher searcher = req.getSearcher();
    final Bits docBits;
    if (corpusFilterQueries != null && corpusFilterQueries.length > 0) {
      List<Query> filterQueries = new ArrayList<>(corpusFilterQueries.length);
      for (String corpusFilterQuery : corpusFilterQueries) {
        QParser qParser = QParser.getParser(corpusFilterQuery, null, req);
        try {
          filterQueries.add(qParser.parse());
        } catch (SyntaxError e) {
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
        }
      }

      final DocSet docSet = searcher.getDocSet(filterQueries); // hopefully in the cache

      docBits = docSet.getBits();
    } else {
      docBits = searcher.getSlowAtomicReader().getLiveDocs();
    }
    return docBits;
  }

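  /*
   * Illustrative schema excerpt that this method detects, making IGNORE_STOPWORDS default to
   * true (tokenizer choice and stopword file name are assumptions):
   *
   *   <analyzer type="index">
   *     <tokenizer class="solr.StandardTokenizerFactory"/>
   *     <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
   *     ...
   *   </analyzer>
   */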
  private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
    FieldType fieldType = req.getSchema().getFieldType(field);
    Analyzer analyzer = fieldType.getIndexAnalyzer(); // index analyzer
    if (analyzer instanceof TokenizerChain) {
      TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
      TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
      for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
        if (tokenFilterFactory instanceof StopFilterFactory) return true;
      }
    }
    return false;
  }

  /** See LUCENE-4541 or {@link org.apache.solr.response.transform.ValueSourceAugmenter}. */
  static class ValueSourceAccessor {
    private final List<LeafReaderContext> readerContexts;
    private final ValueSource valueSource;
    private final Map<Object, Object> fContext;
    private final FunctionValues[] functionValuesPerSeg;
    private final int[] functionValuesDocIdPerSeg;

    ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) {
      readerContexts = searcher.getIndexReader().leaves();
      this.valueSource = valueSource;
      fContext = ValueSource.newContext(searcher);
      functionValuesPerSeg = new FunctionValues[readerContexts.size()];
      functionValuesDocIdPerSeg = new int[readerContexts.size()];
    }

    Object objectVal(int topDocId) throws IOException {
      // lookup segment level stuff:
      int segIdx = ReaderUtil.subIndex(topDocId, readerContexts);
      LeafReaderContext rcontext = readerContexts.get(segIdx);
      int segDocId = topDocId - rcontext.docBase;
      // unfortunately Lucene 7.0 requires forward only traversal (with no reset method).
      //   So we need to track our last docId (per segment) and re-fetch the FunctionValues. :-(
      FunctionValues functionValues = functionValuesPerSeg[segIdx];
      if (functionValues == null || segDocId < functionValuesDocIdPerSeg[segIdx]) {
        functionValues = functionValuesPerSeg[segIdx] = valueSource.getValues(fContext, rcontext);
      }
      functionValuesDocIdPerSeg[segIdx] = segDocId;

      // get value:
      return functionValues.objectVal(segDocId);
    }
  }
}