/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.component;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.LongSummaryStatistics;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A component that can be used in isolation, or in conjunction with {@link QueryComponent} to
* identify & score "phrases" found in the input string, based on shingles in indexed fields.
*
* <p>The most common way to use this component is in conjunction with fields that use {@link
* ShingleFilterFactory} on both the index and query analyzers. An example
* field type configuration would be something like this...
*
* <pre>
* <fieldType name="phrases" class="solr.TextField" positionIncrementGap="100">
*   <analyzer type="index">
*     <tokenizer class="solr.StandardTokenizerFactory"/>
*     <filter class="solr.LowerCaseFilterFactory"/>
*     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="3" outputUnigrams="true"/>
*   </analyzer>
*   <analyzer type="query">
*     <tokenizer class="solr.StandardTokenizerFactory"/>
*     <filter class="solr.LowerCaseFilterFactory"/>
*     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="7" outputUnigramsIfNoShingles="true" outputUnigrams="true"/>
*   </analyzer>
* </fieldType>
* </pre>
*
* <p>...where the query analyzer's maxShingleSize="7" determines the
* maximum possible phrase length that can be heuristically deduced, and the index
* analyzer's maxShingleSize="3" determines the accuracy of phrases identified. The
* larger the indexed maxShingleSize, the higher the accuracy. Both analyzers must
* include minShingleSize="2" outputUnigrams="true".
*
* <p>With a field type like this, one or more fields can be specified (with weights) via a
* phrases.fields param to request that this component identify possible phrases in the input
* q param, or an alternative phrases.q override param. The identified
* phrases will include their scores relative to each field specified, as well as an overall
* weighted score based on the field weights provided by the client. Higher score values
* indicate a greater confidence in the Phrase.
*
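* <p>For example, a (hypothetical) request using the main query string as its input, and
* weighting matches in a field named title twice as heavily as matches in a field named
* body, might look like:
*
* <pre>
* q=why are plants green&phrases=true&phrases.fields=title^2 body
* </pre>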
*
* <p>NOTE: In a distributed request, this component uses a single phase (piggy-backing on
* the {@link ShardRequest#PURPOSE_GET_TOP_IDS} generated by {@link QueryComponent} if it is in use)
* to collect all field & shingle stats. No "refinement" requests are used.
*
* @lucene.experimental
*/
public class PhrasesIdentificationComponent extends SearchComponent {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
* The only shard purpose that will cause this component to do work & return data during a shard
* request
*/
public static final int SHARD_PURPOSE = ShardRequest.PURPOSE_GET_TOP_IDS;
/**
* Name, also used as a request param to identify whether the user query concerns this component
*/
public static final String COMPONENT_NAME = "phrases";
// TODO: ideally these should live in a commons.params class?
public static final String PHRASE_INPUT = "phrases.q";
public static final String PHRASE_FIELDS = "phrases.fields";
public static final String PHRASE_ANALYSIS_FIELD = "phrases.analysis.field";
public static final String PHRASE_SUMMARY_PRE = "phrases.pre";
public static final String PHRASE_SUMMARY_POST = "phrases.post";
public static final String PHRASE_INDEX_MAXLEN = "phrases.maxlength.index";
public static final String PHRASE_QUERY_MAXLEN = "phrases.maxlength.query";
@Override
public void prepare(ResponseBuilder rb) throws IOException {
final SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false)) {
return;
}
if (params.getBool(ShardParams.IS_SHARD, false)) {
// only one stage/purpose where we should do any work on a shard
if (0 == (SHARD_PURPOSE & params.getInt(ShardParams.SHARDS_PURPOSE, 0))) {
return;
}
}
// if we're still here, then we should parse & validate our input,
// putting it in the request context so our process method knows it should do work
rb.req.getContext().put(this.getClass(), PhrasesContextData.parseAndValidateRequest(rb.req));
}
@Override
public int distributedProcess(ResponseBuilder rb) {
final PhrasesContextData contextData =
(PhrasesContextData) rb.req.getContext().get(this.getClass());
if (null == contextData) {
// if prepare didn't give us anything to work with, then we should do nothing
return ResponseBuilder.STAGE_DONE;
}
if (rb.stage < ResponseBuilder.STAGE_EXECUTE_QUERY) {
return ResponseBuilder.STAGE_EXECUTE_QUERY;
} else if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) {
// if we're being used in conjunction with QueryComponent, it should have already created
// (in this stage) the only ShardRequest we need...
for (ShardRequest sreq : rb.outgoing) {
if (0 != (SHARD_PURPOSE & sreq.purpose)) {
return ResponseBuilder.STAGE_GET_FIELDS;
}
}
// ...if we can't find it, then evidently we're being used in isolation,
// and we need to create our own ShardRequest...
ShardRequest sreq = new ShardRequest();
sreq.purpose = SHARD_PURPOSE;
sreq.params = new ModifiableSolrParams(rb.req.getParams());
sreq.params.remove(ShardParams.SHARDS);
rb.addRequest(this, sreq);
return ResponseBuilder.STAGE_GET_FIELDS;
} else if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
// NOTE: we don't do any actual work in this stage, but we need to ensure that even if we are
// being used in isolation w/o QueryComponent that SearchHandler "tracks" a STAGE_GET_FIELDS.
// so that finishStage(STAGE_GET_FIELDS) is called on us and we can add our merged results
// (w/o needing extra code paths for merging phrase results when QueryComponent is/is not
// used)
return ResponseBuilder.STAGE_DONE;
}
return ResponseBuilder.STAGE_DONE;
}
@Override
public void finishStage(ResponseBuilder rb) {
// NOTE: we don't do this after STAGE_EXECUTE_QUERY because if we're also being used with
// QueryComponent, we don't want to add our results to the response until *after*
// QueryComponent adds the main DocList
final PhrasesContextData contextData =
(PhrasesContextData) rb.req.getContext().get(this.getClass());
if (null == contextData || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) {
// if prepare didn't give us anything to work with, or this isn't our stage, then do nothing
return;
}
// sanity check: the shard requests we use/piggy-back on should only happen once per shard,
// but let's future proof ourselves against the possibility that some shards might get/respond
// to the same request "purpose" multiple times...
final BitSet shardsHandled = new BitSet(rb.shards.length);
// Collect Shard responses
for (ShardRequest sreq : rb.finished) {
if (0 != (sreq.purpose & SHARD_PURPOSE)) {
for (ShardResponse shardRsp : sreq.responses) {
final int shardNum = rb.getShardNum(shardRsp.getShard());
if (!shardsHandled.get(shardNum)) {
shardsHandled.set(shardNum);
// shards.tolerant=true can cause nulls on exceptions/errors
// if we don't get phrases/stats from a shard, just ignore that shard
final SolrResponse rsp = shardRsp.getSolrResponse();
if (null == rsp) continue;
final NamedList<Object> top = rsp.getResponse();
if (null == top) continue;
@SuppressWarnings({"unchecked"})
final NamedList<Object> phrasesWrapper = (NamedList<Object>) top.get("phrases");
if (null == phrasesWrapper) continue;
@SuppressWarnings({"unchecked"})
final List<NamedList<Object>> shardPhrases =
(List<NamedList<Object>>) phrasesWrapper.get("_all");
if (null == shardPhrases) continue;
Phrase.populateStats(contextData.allPhrases, shardPhrases);
}
}
}
}
scoreAndAddResultsToResponse(rb, contextData);
}
@Override
public void process(ResponseBuilder rb) throws IOException {
final PhrasesContextData contextData =
(PhrasesContextData) rb.req.getContext().get(this.getClass());
if (null == contextData) {
// if prepare didn't give us anything to work with, then we should do nothing
return;
}
// regardless of single node / shard, we need local stats...
Phrase.populateStats(
contextData.allPhrases, contextData.fieldWeights.keySet(), rb.req.getSearcher());
if (rb.req.getParams().getBool(ShardParams.IS_SHARD, false)) {
// shard request, return stats for all phrases (in original order)
SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
output.add("_all", Phrase.formatShardResponse(contextData.allPhrases));
// TODO: might want to add numDocs() & getSumTotalTermFreq(f)/getDocCount(f) stats from each
// field... so that we can sum/merge them for use in scoring?
rb.rsp.add("phrases", output);
} else {
// full single node request...
scoreAndAddResultsToResponse(rb, contextData);
}
}
/**
* Helper method (suitable for both single node & distributed coordinator node) to score,
* sort, and format the end user response once all phrases have been populated with stats.
*/
private void scoreAndAddResultsToResponse(
final ResponseBuilder rb, final PhrasesContextData contextData) {
assert null != contextData : "Should not be called if no phrase data to use";
if (null == contextData) {
// if prepare didn't give us anything to work with, then we should do nothing
return;
}
SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
rb.rsp.add("phrases", output);
output.add("input", contextData.rawInput);
if (0 == contextData.allPhrases.size()) {
// w/o any phrases, the summary is just the input again...
output.add("summary", contextData.rawInput);
output.add("details", Collections.emptyList());
return;
}
Phrase.populateScores(contextData);
final int maxPosition =
contextData.allPhrases.get(contextData.allPhrases.size() - 1).getPositionEnd();
final List<Phrase> validScoringPhrasesSorted =
contextData.allPhrases.stream()
// TODO: ideally this cut off of "0.0" should be a request option...
// so users can tune how aggressive/conservative they want to be in finding phrases
// but for that to be useful, we need:
// - more hard & fast documentation about the "range" of scores that may be returned
// - "useful" scores for single words
.filter(p -> 0.0D < p.getTotalScore())
.sorted(Comparator.comparing((p -> p.getTotalScore()), Collections.reverseOrder()))
.collect(Collectors.toList());
// we want to return only high scoring phrases that don't overlap w/higher scoring phrase
final BitSet positionsCovered = new BitSet(maxPosition + 1);
final List<Phrase> results = new ArrayList<>(maxPosition);
for (Phrase phrase : validScoringPhrasesSorted) {
final BitSet phrasePositions = phrase.getPositionsBitSet();
if (!phrasePositions.intersects(positionsCovered)) {
// we can use this phrase, record it...
positionsCovered.or(phrasePositions);
results.add(phrase);
} // else: overlaps higher scoring position(s), skip this phrase
if (positionsCovered.cardinality() == maxPosition + 1) {
// all positions are covered, so we can bail out and skip the rest
break;
}
}
// a "quick summary" of the suggested parsing
output.add("summary", contextData.summarize(results));
// useful user level info on every (high scoring) phrase found (in current, descending score,
// order)
output.add("details", results.stream().map(p -> p.getDetails()).collect(Collectors.toList()));
}
@Override
public String getDescription() {
return "Phrases Identification Component";
}
/**
* Simple container for all request options and data this component needs to store in the Request
* Context
*
* @lucene.internal
*/
public static final class PhrasesContextData {
public final String rawInput;
public final int maxIndexedPositionLength;
public final int maxQueryPositionLength;
public final Map<String, Double> fieldWeights;
public final SchemaField analysisField;
public final List<Phrase> allPhrases;
public final String summaryPre;
public final String summaryPost;
// TODO: add an option to bias field weights based on sumTTF of the fields
// (easy enough to "sum the sums" across multiple shards before scoring)
/**
* Parses the params included in this request, throwing appropriate user level Exceptions for
* invalid input, and returning a PhrasesContextData suitable for use in this request.
*/
public static PhrasesContextData parseAndValidateRequest(final SolrQueryRequest req)
throws SolrException {
return new PhrasesContextData(req);
}
private PhrasesContextData(final SolrQueryRequest req) throws SolrException {
final SolrParams params = req.getParams();
this.rawInput = params.get(PHRASE_INPUT, params.get(CommonParams.Q));
if (null == this.rawInput) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
"phrase identification requires a query string or " + PHRASE_INPUT + " param override");
}
{ // field weights & analysis field...
SchemaField tmpAnalysisField = null;
Map<String, Double> tmpWeights = new TreeMap<>();
final String analysisFieldName = params.get(PHRASE_ANALYSIS_FIELD);
if (null != analysisFieldName) {
tmpAnalysisField = req.getSchema().getFieldOrNull(analysisFieldName);
if (null == tmpAnalysisField) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
PHRASE_ANALYSIS_FIELD
+ " param specifies a field name that does not exist: "
+ analysisFieldName);
}
}
final Map<String, Float> rawFields =
SolrPluginUtils.parseFieldBoosts(params.getParams(PHRASE_FIELDS));
if (rawFields.isEmpty()) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
PHRASE_FIELDS
+ " param must specify a (weighted) list of fields "
+ "to evaluate for phrase identification");
}
for (Map.Entry<String, Float> entry : rawFields.entrySet()) {
final SchemaField field = req.getSchema().getFieldOrNull(entry.getKey());
if (null == field) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
PHRASE_FIELDS
+ " param contains a field name that does not exist: "
+ entry.getKey());
}
if (null == tmpAnalysisField) {
tmpAnalysisField = field;
}
if (null == analysisFieldName) {
if (!field.getType().equals(tmpAnalysisField.getType())) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
"All fields specified in "
+ PHRASE_FIELDS
+ " must have the same fieldType, "
+ "or the advanced "
+ PHRASE_ANALYSIS_FIELD
+ " option must specify an override");
}
}
// if a weight isn't specified, assume "1.0"
final double weight = null == entry.getValue() ? 1.0D : entry.getValue();
if (weight < 0) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
PHRASE_FIELDS
+ " param must use non-negative weight value for field "
+ field.getName());
}
tmpWeights.put(entry.getKey(), weight);
}
assert null != tmpAnalysisField;
this.analysisField = tmpAnalysisField;
this.fieldWeights = Collections.unmodifiableMap(tmpWeights);
}
{ // index/query max phrase sizes...
final FieldType ft = analysisField.getType();
this.maxIndexedPositionLength =
req.getParams().getInt(PHRASE_INDEX_MAXLEN, getMaxShingleSize(ft.getIndexAnalyzer()));
if (this.maxIndexedPositionLength < 0) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
"Unable to determine max position length of indexed phrases using "
+ "index analyzer for analysis field: "
+ analysisField.getName()
+ " and no override detected using param: "
+ PHRASE_INDEX_MAXLEN);
}
this.maxQueryPositionLength =
req.getParams().getInt(PHRASE_QUERY_MAXLEN, getMaxShingleSize(ft.getQueryAnalyzer()));
if (this.maxQueryPositionLength < 0) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
"Unable to determine max position length of query phrases using "
+ "query analyzer for analysis field: "
+ analysisField.getName()
+ " and no override detected using param: "
+ PHRASE_QUERY_MAXLEN);
}
if (this.maxQueryPositionLength < this.maxIndexedPositionLength) {
throw new SolrException(
ErrorCode.BAD_REQUEST,
"Effective value of "
+ PHRASE_INDEX_MAXLEN
+ " (either from index analyzer shingle factory, "
+ " or expert param override) must be less then or equal to the effective value of "
+ PHRASE_QUERY_MAXLEN
+ " (either from query analyzer shingle factory, or expert param override)");
}
}
this.summaryPre = params.get(PHRASE_SUMMARY_PRE, "{");
this.summaryPost = params.get(PHRASE_SUMMARY_POST, "}");
this.allPhrases =
Phrase.extractPhrases(
this.rawInput,
this.analysisField,
this.maxIndexedPositionLength,
this.maxQueryPositionLength);
}
/**
* Given a list of phrases to be returned to the user, summarizes those phrases by decorating
* the original input string to indicate where the identified phrases exist, using {@link
* #summaryPre} and {@link #summaryPost}
*
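* <p>For example (a hypothetical result): with the default summaryPre of "{" and summaryPost
* of "}", an input of "why are plants green" where "plants green" was identified as a phrase
* would be summarized as "why are {plants green}"
*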
* @param results a list of (non overlapping) Phrases that have been identified, sorted from
* highest scoring to lowest
* @return the original user input, decorated to indicate the identified phrases
*/
public String summarize(final List<Phrase> results) {
final StringBuilder out = new StringBuilder(rawInput);
// sort by *reverse* position so we can go back to front
final List<Phrase> reversed =
results.stream()
.sorted(Comparator.comparing((p -> p.getPositionStart()), Collections.reverseOrder()))
.collect(Collectors.toList());
for (Phrase p : reversed) {
out.insert(p.getOffsetEnd(), summaryPost);
out.insert(p.getOffsetStart(), summaryPre);
}
return out.toString();
}
}
/**
* Model the data known about a single (candidate) Phrase -- which may or may not be indexed
*
* @lucene.internal
*/
public static final class Phrase {
/**
* Factory method for constructing a list of Phrases given the specified input and using the
* analyzer for the specified field. The maxIndexedPositionLength and
* maxQueryPositionLength provided *must* match the effective values used by the respective
* analyzers.
*/
public static List<Phrase> extractPhrases(
final String input,
final SchemaField analysisField,
final int maxIndexedPositionLength,
final int maxQueryPositionLength) {
// TODO: rather than requiring the query analyzer to produce the Phrases for us (assuming
// Shingles), we could potentially just require that it produces unigrams compatible with the
// unigrams in the indexed fields, and then build our own Phrases at query time -- making the
// maxQueryPositionLength a 100% run time configuration option. But that could be tricky given
// an arbitrary analyzer -- we'd have to pay careful attention to positions, and we'd have to
// guess/assume what placeholders/fillers were used in the indexed Phrases (typically shingles)
assert maxIndexedPositionLength <= maxQueryPositionLength;
final CharsRefBuilder buffer = new CharsRefBuilder();
final FieldType ft = analysisField.getType();
final Analyzer analyzer = ft.getQueryAnalyzer();
final List<Phrase> results = new ArrayList<>(42);
try (TokenStream tokenStream = analyzer.tokenStream(analysisField.getName(), input)) {
final OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
final PositionIncrementAttribute posIncAttr =
tokenStream.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLenAttr =
tokenStream.addAttribute(PositionLengthAttribute.class);
final TermToBytesRefAttribute termAttr =
tokenStream.addAttribute(TermToBytesRefAttribute.class);
int position = 0;
int lastPosLen = -1;
tokenStream.reset();
while (tokenStream.incrementToken()) {
final Phrase phrase = new Phrase();
final int posInc = posIncAttr.getPositionIncrement();
final int posLen = posLenAttr.getPositionLength();
if (0 == posInc && posLen <= lastPosLen) {
// This requirement that analyzers return tokens in ascending order of length
// is currently necessary for the "linking" logic below to work
// if people run into real world situations where this is problematic,
// we can relax this check if we also make the linking logic more complex
// (ie: less optimized)
throw new SolrException(
ErrorCode.BAD_REQUEST,
"Phrase identification currently requires that "
+ "the analyzer used must produce tokens that overlap in increasing order of length. ");
}
position += posInc;
lastPosLen = posLen;
phrase.position_start = position;
phrase.position_end = position + posLen;
phrase.is_indexed = (posLen <= maxIndexedPositionLength);
phrase.offset_start = offsetAttr.startOffset();
phrase.offset_end = offsetAttr.endOffset();
// populate the subsequence directly from the raw input using the offsets,
// (instead of using the TermToBytesRefAttribute) so we preserve the original
// casing, whitespace, etc...
phrase.subSequence = input.subSequence(phrase.offset_start, phrase.offset_end);
if (phrase.is_indexed) {
// populate the bytes so we can build term queries
phrase.bytes = BytesRef.deepCopyOf(termAttr.getBytesRef());
}
results.add(phrase);
}
tokenStream.end();
} catch (IOException e) {
throw new SolrException(
ErrorCode.SERVER_ERROR, "Analysis error extracting phrases from: " + input, e);
}
// fill in the relationships of each phrase
//
// NOTE: this logic currently requires that the phrases are sorted by position ascending
// (automatic because of how PositionIncrementAttribute works) then by length ascending
// (when positions are tied).
// We could de-optimize this code if we find that secondary ordering is too restrictive for
// some analyzers
//
// NOTE: changes to the scoring model may allow us to optimize/prune down the relationships
// tracked, ...OR... may require us to add/track more details about sub/parent phrases
//
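// For example (assuming a shingle analyzer w/ unigrams, as in the class javadoc): the input
// "tiny black ants" arrives here as "tiny" (pos 1, len 1), "tiny black" (pos 1, len 2),
// "tiny black ants" (pos 1, len 3), "black" (pos 2, len 1), "black ants" (pos 2, len 2),
// and "ants" (pos 3, len 1) -- position ascending, then length ascending within a position
//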
for (int p = 0; p < results.size(); p++) {
final Phrase current = results.get(p);
if (!current.is_indexed) {
// we're not an interesting sub phrase of anything
continue;
}
// setup links from the phrase to itself if needed
addLinkages(current, current, maxIndexedPositionLength);
// scan backwards looking for phrases that might include us...
BEFORE:
for (int i = p - 1; 0 <= i; i--) {
final Phrase previous = results.get(i);
if (previous.position_start < (current.position_end - maxQueryPositionLength)) {
// we've scanned so far back nothing else is viable
break BEFORE;
}
// any 'previous' phrases must start where current starts or earlier,
// so only need to check the end...
if (current.position_end <= previous.position_end) {
addLinkages(previous, current, maxIndexedPositionLength);
}
}
// scan forwards looking for phrases that might include us...
AFTER:
for (int i = p + 1; i < results.size(); i++) {
final Phrase next = results.get(i);
// the only way a phrase that comes after current can include current is
// if they have the same start position...
if (current.position_start != next.position_start) {
// we've scanned so far forward nothing else is viable
break AFTER;
}
// any 'next' phrases must start where current starts, so only need to check the end...
if (current.position_end <= next.position_end) {
addLinkages(next, current, maxIndexedPositionLength);
}
}
}
return Collections.unmodifiableList(results);
}
/**
* Given two phrases, one of which is a superset of the other, adds the necessary linkages
* needed by the scoring model
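*
* <p>For example (a hypothetical input): with maxIndexedPositionLength=3, the phrase
* "tiny black ants climbed" is too long to be indexed whole, so "tiny black ants" and
* "black ants climbed" become its largest indexed sub-phrases, and each of its unigrams
* ("tiny", "black", ...) become its individual indexed terms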
*/
private static void addLinkages(
final Phrase outer, final Phrase inner, final int maxIndexedPositionLength) {
assert outer.position_start <= inner.position_start;
assert inner.position_end <= outer.position_end;
assert inner.is_indexed;
final int inner_len = inner.getPositionLength();
if (1 == inner_len) {
outer.individualIndexedTerms.add(inner);
}
if (maxIndexedPositionLength == inner_len
|| (inner == outer && inner_len < maxIndexedPositionLength)) {
outer.largestIndexedSubPhrases.add(inner);
}
if (outer.is_indexed && inner != outer) {
inner.indexedSuperPhrases.add(outer);
}
}
/**
* Format the phrases suitable for returning in a shard response
*
* @see #populateStats(List,List)
*/
public static List<NamedList<Object>> formatShardResponse(final List<Phrase> phrases) {
List<NamedList<Object>> results = new ArrayList<>(phrases.size());
for (Phrase p : phrases) {
NamedList<Object> data = new SimpleOrderedMap<>();
// quick and dirty way to validate that our shards aren't using different analyzers
// so the coordinating node can fail fast when merging the results
data.add("checksum", p.getChecksum());
if (p.is_indexed) {
data.add("ttf", new NamedList(p.phrase_ttf));
data.add("df", new NamedList(p.phrase_df));
}
data.add("conj_dc", new NamedList(p.subTerms_conjunctionCounts));
results.add(data);
}
return results;
}
/**
* Populates the phrases with (merged) stats from a remote shard
*
* @see #formatShardResponse
*/
@SuppressWarnings({"unchecked"})
public static void populateStats(
final List<Phrase> phrases, final List<NamedList<Object>> shardData) {
final int numPhrases = phrases.size();
if (shardData.size() != numPhrases) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
"num phrases in shard data not consistent: " + numPhrases + " vs " + shardData.size());
}
for (int i = 0; i < phrases.size(); i++) {
// rather than being paranoid about the expected structure, we'll just let the low level
// code throw an NPE / CCE / AIOOBE / etc. and wrap & rethrow later...
try {
final Phrase p = phrases.get(i);
final NamedList<Object> data = shardData.get(i);
// sanity check the correct phrase
if (!p.getChecksum().equals(data.get("checksum"))) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
"phrase #" + i + " in shard data had invalid checksum");
}
if (p.is_indexed) {
for (Map.Entry<String, Long> ttf : (NamedList<Long>) data.get("ttf")) {
p.phrase_ttf.merge(ttf.getKey(), ttf.getValue(), Long::sum);
}
for (Map.Entry<String, Long> df : (NamedList<Long>) data.get("df")) {
p.phrase_df.merge(df.getKey(), df.getValue(), Long::sum);
}
}
for (Map.Entry<String, Long> conj_dc : (NamedList<Long>) data.get("conj_dc")) {
p.subTerms_conjunctionCounts.merge(conj_dc.getKey(), conj_dc.getValue(), Long::sum);
}
} catch (RuntimeException e) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
"shard data for phrase#" + i + " not consistent",
e);
}
}
}
/** Populates the phrases with stats from the local index for the specified fields */
public static void populateStats(
final List<Phrase> phrases,
final Collection<String> fieldNames,
final SolrIndexSearcher searcher)
throws IOException {
final IndexReader reader = searcher.getIndexReader();
for (String field : fieldNames) {
for (Phrase phrase : phrases) {
if (phrase.is_indexed) {
// add stats based on this entire phrase as an indexed term
final Term t = new Term(field, phrase.bytes);
phrase.phrase_ttf.put(field, reader.totalTermFreq(t));
phrase.phrase_df.put(field, (long) reader.docFreq(t));
}
// even if our phrase is too long to be indexed whole, add stats based on the
// conjunction of all the individual terms in the phrase
List<Query> filters = new ArrayList<>(phrase.individualIndexedTerms.size());
for (Phrase term : phrase.individualIndexedTerms) {
// trust the SolrIndexSearcher to cache & intersect the individual terms so that this
// can be efficient regardless of how often terms are re-used multiple times in the
// input/phrases
filters.add(new TermQuery(new Term(field, term.bytes)));
}
final long count = searcher.getDocSet(filters).size();
phrase.subTerms_conjunctionCounts.put(field, count);
}
}
}
/**
* Uses the previously populated stats to populate each Phrase with its scores for the specified
* fields, and its overall (weighted) total score. This is not needed on shard requests.
*
* @see #populateStats
* @see #getFieldScore(String)
* @see #getTotalScore
*/
public static void populateScores(final PhrasesContextData contextData) {
populateScores(
contextData.allPhrases,
contextData.fieldWeights,
contextData.maxIndexedPositionLength,
contextData.maxQueryPositionLength);
}
/**
* Public for testing purposes
*
* @see #populateScores(PhrasesIdentificationComponent.PhrasesContextData)
* @lucene.internal
*/
public static void populateScores(
final List<Phrase> phrases,
final Map<String, Double> fieldWeights,
final int maxIndexedPositionLength,
final int maxQueryPositionLength) {
final double total_weight =
fieldWeights.values().stream().mapToDouble(Double::doubleValue).sum();
for (Phrase phrase : phrases) {
double phrase_cumulative_score = 0.0D;
for (Map.Entry<String, Double> entry : fieldWeights.entrySet()) {
final String field = entry.getKey();
final double weight = entry.getValue();
double field_score =
computeFieldScore(
phrase, field,
maxIndexedPositionLength, maxQueryPositionLength);
phrase.fieldScores.put(field, field_score);
phrase_cumulative_score += (field_score * weight);
}
phrase.total_score =
(total_weight < 0
? Double.NEGATIVE_INFINITY
: (phrase_cumulative_score / total_weight));
}
}
private Phrase() {
// No-Op
}
private boolean is_indexed;
private double total_score = -1.0D; // until we get a computed score, this is "not a phrase"
private CharSequence subSequence;
private BytesRef bytes;
private int offset_start;
private int offset_end;
private int position_start;
private int position_end;
private Integer checksum = null;
/** NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves */
private final List<Phrase> individualIndexedTerms = new ArrayList<>(7);
/**
* NOTE: Indexed phrases of length less than the max indexed length are the (sole) largest
* sub-phrases of themselves
*/
private final List<Phrase> largestIndexedSubPhrases = new ArrayList<>(7);
/** Phrases larger than this phrase which are indexed and fully contain it */
private final List<Phrase> indexedSuperPhrases = new ArrayList<>(7);
// NOTE: keys are field names
private final Map<String, Long> subTerms_conjunctionCounts = new TreeMap<>();
private final Map<String, Long> phrase_ttf = new TreeMap<>();
private final Map<String, Long> phrase_df = new TreeMap<>();
private final Map<String, Double> fieldScores = new TreeMap<>();
@Override
public String toString() {
return "'"
+ subSequence
+ "'"
+ "["
+ offset_start
+ ":"
+ offset_end
+ "]"
+ "["
+ position_start
+ ":"
+ position_end
+ "]";
}
public NamedList<Object> getDetails() {
SimpleOrderedMap<Object> out = new SimpleOrderedMap<>();
out.add("text", subSequence);
out.add("offset_start", getOffsetStart());
out.add("offset_end", getOffsetEnd());
out.add("score", getTotalScore());
out.add("field_scores", fieldScores);
return out;
}
/**
* Computes & caches the checksum of this Phrase (if not already cached). Needed only when
* merging shard data to validate no inconsistencies with the remote shards
*/
private Integer getChecksum() {
if (null == checksum) {
checksum =
Arrays.hashCode(new int[] {offset_start, offset_end, position_start, position_end});
}
return checksum;
}
/** The characters from the original input that correspond with this Phrase */
public CharSequence getSubSequence() {
return subSequence;
}
/**
* Returns the list of "individual" (ie: getPositionLength()==1
terms. NOTE:
* Indexed phrases of length 1 are the (sole) individual terms of themselves
*/
public List<Phrase> getIndividualIndexedTerms() {
return individualIndexedTerms;
}
/**
* Returns the list of (overlapping) sub phrases that have the largest possible size based on
* the effective value of {@link PhrasesContextData#maxIndexedPositionLength}. NOTE: Indexed
* phrases of length less than the max indexed length are the (sole) largest sub-phrases of
* themselves.
*/
public List<Phrase> getLargestIndexedSubPhrases() {
return largestIndexedSubPhrases;
}
/**
* Returns all phrases larger than this phrase, which fully include this phrase, and are
* indexed. NOTE: A Phrase is never the super phrase of itself.
*/
public List<Phrase> getIndexedSuperPhrases() {
return indexedSuperPhrases;
}
/** NOTE: positions start at '1' */
public int getPositionStart() {
return position_start;
}
/** NOTE: positions start at '1' */
public int getPositionEnd() {
return position_end;
}
public int getPositionLength() {
return position_end - position_start;
}
/** Each set bit identifies a position filled by this Phrase */
public BitSet getPositionsBitSet() {
final BitSet result = new BitSet();
result.set(position_start, position_end);
return result;
}
public int getOffsetStart() {
return offset_start;
}
public int getOffsetEnd() {
return offset_end;
}
/**
* Returns the overall score for this Phrase. In the current implementation, the only guarantee
* made regarding the range of possible values is that 0 (or less) means it is not a good
* phrase.
*
* @return A numeric value indicating the confidence in this Phrase, higher numbers are higher
* confidence.
*/
public double getTotalScore() {
return total_score;
}
/**
* Returns the score for this Phrase in this given field. In the current implementation, the
* only guarantee made regarding the range of possible values is that 0 (or less) means it is not
* a good phrase.
*
* @return A numeric value indicating the confidence in this Phrase for this field, higher
* numbers are higher confidence.
*/
public double getFieldScore(String field) {
return fieldScores.getOrDefault(field, -1.0D);
}
/**
* Returns the total TTF (total term frequency) of this (indexed) Phrase as a term in the
* specified field. NOTE: behavior of calling this method is undefined unless one of the {@link
* #populateStats} methods has been called with this field.
*/
public long getTTF(String field) {
if (!is_indexed) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR, "TTF is only available for indexed phrases");
}
return phrase_ttf.getOrDefault(field, 0L);
}
/**
* Returns the number of documents that contain all of the {@link
* #getIndividualIndexedTerms} that make up this Phrase, in the specified field. NOTE: behavior
* of calling this method is undefined unless one of the {@link #populateStats} methods has been
* called with this field.
*/
public long getConjunctionDocCount(String field) {
return subTerms_conjunctionCounts.getOrDefault(field, 0L);
}
/**
* Returns the number of documents that contain this (indexed) Phrase as a term in the
* specified field. NOTE: behavior of calling this method is undefined unless one of the {@link
* #populateStats} methods has been called with this field.
*/
public long getDocFreq(String field) {
if (!is_indexed) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR, "DF is only available for indexed phrases");
}
return phrase_df.getOrDefault(field, 0L);
}
/**
* Uses the previously populated stats to compute a score for the specified field.
*
* <p>The current implementation returns scores in the range of [0,1], but this may
* change in future implementations. The only current guarantees are:
*
* <ul>
* <li>0 (or less) means this is guaranteed to not be a phrase
* <li>larger numbers are higher confidence
* </ul>
*
* @see #populateStats
* @see #populateScores
* @see #getFieldScore(String)
* @return a score value
*/
private static double computeFieldScore(
final Phrase input,
final String field,
final int maxIndexedPositionLength,
final int maxQueryPositionLength) {
final long num_indexed_sub_phrases = input.getLargestIndexedSubPhrases().size();
assert 0 <= num_indexed_sub_phrases; // should be impossible
if (input.getIndividualIndexedTerms().size() < input.getPositionLength()) {
// there are "gaps" in our input, where individual words have not been indexed (stop words,
// or multivalue position gap) which means we are not a viable candidate for being a valid
// Phrase.
return -1.0D;
}
final long phrase_conj_count = input.getConjunctionDocCount(field);
// if there isn't a single document containing all the terms in our
// phrase, then it is 100% not a phrase
if (phrase_conj_count <= 0) {
return -1.0D;
}
// single words automatically score 0.0 (unless they already scored less for not existing)
if (input.getPositionLength() <= 1) {
return 0.0D;
}
double field_score = 0.0D;
long max_sub_conj_count = phrase_conj_count;
// At the moment, the contribution of each "words" sub-Phrase to the field score to the input
// Phrase is independent of any context of "input". Depending on if/how sub-phrase scoring
// changes, we might consider computing the scores of all the indexed phrases first, and
// caching the portions of their values that are re-used when computing the scores of
// longer phrases?
//
// This would make the overall scoring of all phrases a lot more complicated,
// but could save CPU cycles?
// (particularly when maxIndexedPositionLength <<< maxQueryPositionLength ???)
//
// My gut says that knowing the conj_count(input) "context" should help us score the
// sub-phrases better, but i can't yet put my finger on why/how. maybe by comparing
// the conj_count(input) to the max(conj_count(parent of words)) ?
// for each of the longest indexed phrases, aka indexed sub-sequence of "words", we have...
for (Phrase words : input.getLargestIndexedSubPhrases()) {
// we're going to compute scores in range of [-1:1] to indicate the likelihood that our
// "words" should be used as a "phrase", based on a bayesian document categorization model,
// where the "words as a phrase" (aka: phrase) is our candidate category.
//
// P(words|phrase) * P(phrase) - P(words|not phrase) * P(not phrase)
//
// Where...
// P(words|phrase) = phrase_ttf / min(word_ttf)
// P(phrase) =~ phrase_docFreq / conj_count(words in phrase) *SEE NOTE BELOW*
// P(words|not phrase) = phrase_ttf / max(word_ttf)
// P(not a phrase) = 1 - P(phrase)
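//
// For example (hypothetical stats): if ttf("brown fox") = 10, ttf("brown") = 50,
// ttf("fox") = 20, df("brown fox") = 8, and 16 docs contain both "brown" and "fox", then:
// P(words|phrase) = 10 / min(50, 20) = 0.50
// P(words|not phrase) = 10 / max(50, 20) = 0.20
// P(phrase) = 8 / 16 = 0.50
// ...so (ignoring any wrapper phrases) the contribution of "brown fox" to the field
// score would be 0.5 * 0.5 - 0.2 * (1 - 0.5) = 0.15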
//
// ... BUT! ...
//
// NOTE: we're going to reduce our "P(phrase) by the max "P(phrase)" of all the (indexed)
// candidate phrases we are a sub-phrase of, to try to offset the inherent bias in favor
// of small indexed phrases -- because anytime the super-phrase exists, the sub-phrase
// exists
// IDEA: consider replacing this entire bayesian model with LLR (or rootLLR)...
//
// http://mahout.apache.org/docs/0.13.0/api/docs/mahout-math/org/apache/mahout/math/stats/LogLikelihood.html
// ...where we compute LLR over each of the TTF of the pairs of adjacent sub-phrases of each
// indexed phrase and take the min|max|avg of the LLR scores.
//
// ie: for indexed shingle "quick brown fox" compute LLR(ttf("quick"), ttf("brown fox")) &
// LLR(ttf("quick brown"), ttf("fox")) using ttf("quick brown fox") as the co-occurance
// count, and sumTTF-ttf("quick")-ttf("brown")-ttf("fox") as the "something else"
//
// (we could actually compute LLR stats over TTF and DF and combine them)
//
// NOTE: Going the LLR/rootLLR route would require building a full "tree" of every (indexed)
// sub-phrase of every other phrase (or at least: all siblings of diff sizes that add up to
// an existing phrase). As well as require us to give up on a predictable "range" of
// legal values for scores (IIUC from the LLR docs)
final long phrase_ttf = words.getTTF(field);
final long phrase_df = words.getDocFreq(field);
final long words_conj_count = words.getConjunctionDocCount(field);
max_sub_conj_count = Math.max(words_conj_count, max_sub_conj_count);
final double max_wrapper_phrase_probability =
words.getIndexedSuperPhrases().stream()
.mapToDouble(
p ->
p.getConjunctionDocCount(field) <= 0
?
// special case check -- we already know *our* conj count > 0,
// but we need a similar check for wrapper phrases: if <= 0, their
// probability is 0
0.0D
: ((double) p.getDocFreq(field) / p.getConjunctionDocCount(field)))
.max()
.orElse(0.0D);
final LongSummaryStatistics words_ttfs =
words.getIndividualIndexedTerms().stream()
.collect(Collectors.summarizingLong(t -> t.getTTF(field)));
final double words_phrase_prob = (phrase_ttf / (double) words_ttfs.getMin());
final double words_not_phrase_prob = (phrase_ttf / (double) words_ttfs.getMax());
final double phrase_prob = (phrase_conj_count / (double) words_conj_count);
final double phrase_score =
words_phrase_prob * (phrase_prob - max_wrapper_phrase_probability);
final double not_phrase_score =
words_not_phrase_prob * (1 - (phrase_prob - max_wrapper_phrase_probability));
final double words_score = phrase_score - not_phrase_score;
field_score += words_score;
}
// NOTE: the "scaling" factors below can "increase" negative scores (by reducing the unsigned
// value) when they should ideally be penalizing the scores further, but since we currently
// don't care about any score lower then 0, it's not worth worrying about.
// Average the accumulated score over the number of actual indexed sub-phrases that
// contributed
//
// NOTE: since we subsequently want to multiply the score by a fraction with
// num_indexed_sub_phrases in the numerator, we can skip this... SEE BELOW // field_score /=
// (double) num_indexed_sub_phrases;
// If we leave field_score as is, then a phrase longer than the maxIndexedPositionLength will
// never score higher than the highest scoring sub-phrase it has (because we've averaged them)
// so we scale the scores against the longest possible phrase length we're considering
//
// NOTE: We don't use num_indexed_sub_phrases in the numerator since we skipped it when
// averaging above...
field_score *=
(1.0D // SEE ABOVE // * ( (double)num_indexed_sub_phrases )
/ (1 + maxQueryPositionLength - maxIndexedPositionLength));
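// (e.g.: with the example analyzers from the class javadoc, maxIndexedPositionLength=3 and
// maxQueryPositionLength=7, so each field_score is scaled down by 1 / (1 + 7 - 3) = 0.2)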
// scale the field_score based on the ratio of the conjunction docCount for the whole phrase
// relative to the largest conjunction docCount of its (largest indexed) sub phrases, to
// penalize the scores of very long phrases that exist very rarely relative to how often
// their sub phrases exist in the index
field_score *= (((double) phrase_conj_count) / max_sub_conj_count);
return field_score;
}
}
/**
* Helper method, public for testing purposes only.
*
* <p>Given an analyzer, inspects it to determine if:
*
* <ul>
* <li>it is a {@link TokenizerChain}
* <li>it contains exactly one instance of {@link ShingleFilterFactory}
* </ul>
*
* <p>If these conditions are met, then this method returns the maxShingleSize
* in effect for this analyzer, otherwise returns -1.
*
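* <p>For example, given the "phrases" field type shown in the class javadoc, this method
* would return 3 for the index analyzer and 7 for the query analyzer.
*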
* @param analyzer An analyzer to inspect
* @return the maxShingleSize if available, otherwise -1
* @lucene.internal
*/
public static int getMaxShingleSize(Analyzer analyzer) {
if (!TokenizerChain.class.isInstance(analyzer)) {
return -1;
}
final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
if (0 == factories.length) {
return -1;
}
int result = -1;
for (TokenFilterFactory tff : factories) {
if (ShingleFilterFactory.class.isInstance(tff)) {
if (0 < result) {
// more than one shingle factory in our analyzer, which is weird, so make no
// assumptions...
return -1;
}
// would be nice if there was an easy way to just ask a factory for the effective value
// of an argument...
final Map<String, String> args = tff.getOriginalArgs();
result =
args.containsKey("maxShingleSize")
? Integer.parseInt(args.get("maxShingleSize"))
: ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
}
}
return result;
}
}