/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.component;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.LongSummaryStatistics;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.util.SolrPluginUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * A component that can be used in isolation, or in conjunction with {@link QueryComponent} to identify
 * &amp; score "phrases" found in the input string, based on shingles in indexed fields.
 *
 * <p>
 * The most common way to use this component is in conjunction with fields that use
 * {@link ShingleFilterFactory} on both the <code>index</code> and <code>query</code> analyzers.
 * An example field type configuration would be something like this...
 * </p>
 * <pre class="prettyprint">
 * &lt;fieldType name="phrases" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer type="index"&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="3" outputUnigrams="true"/&gt;
 *   &lt;/analyzer&gt;
 *   &lt;analyzer type="query"&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="7" outputUnigramsIfNoShingles="true" outputUnigrams="true"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;
 * </pre>
 * <p>
 * ...where the query analyzer's <code>maxShingleSize="7"</code> determines the maximum
 * possible phrase length that can be heuristically deduced, and the index analyzer's
 * <code>maxShingleSize="3"</code> determines the accuracy of the phrases identified.  The larger
 * the indexed <code>maxShingleSize</code>, the higher the accuracy.  Both analyzers must include
 * <code>minShingleSize="2" outputUnigrams="true"</code>.
 * </p>
 * <p>
 * With a field type like this, one or more fields can be specified (with weights) via a
 * <code>phrases.fields</code> param to request that this component identify possible phrases in the
 * input <code>q</code> param, or an alternative <code>phrases.q</code> override param.  The identified
 * phrases will include their scores relative to each field specified, as well as an overall
 * weighted score based on the field weights provided by the client.  Higher score values indicate
 * a greater confidence in the Phrase.
 * </p>
 * <p>
 * NOTE: In a distributed request, this component uses a single phase (piggybacking on the
 * {@link ShardRequest#PURPOSE_GET_TOP_IDS} generated by {@link QueryComponent} if it is in use) to
 * collect all field &amp; shingle stats.  No "refinement" requests are used.
 * </p>
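 * <p>
 * For illustration (a hypothetical request; exact output depends on the index contents), a
 * request such as <code>q=brown+fox&amp;phrases=true&amp;phrases.fields=title^2+body</code>
 * asks this component to identify phrases in the input using the <code>title</code> field
 * (weighted 2.0) and the <code>body</code> field (default weight 1.0).  The response will then
 * contain a <code>phrases</code> section listing the original input, a decorated summary, and
 * a details list of each identified phrase with its per-field and total scores.
 * </p>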
 *
 * @lucene.experimental
 */
public class PhrasesIdentificationComponent extends SearchComponent {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /** The only shard purpose that will cause this component to do work &amp; return data during shard req */
  public static final int SHARD_PURPOSE = ShardRequest.PURPOSE_GET_TOP_IDS;

  /** Name, also used as a request param to identify whether the user query concerns this component */
  public static final String COMPONENT_NAME = "phrases";

  // TODO: ideally these should live in a commons.params class?
  public static final String PHRASE_INPUT = "phrases.q";
  public static final String PHRASE_FIELDS = "phrases.fields";
  public static final String PHRASE_ANALYSIS_FIELD = "phrases.analysis.field";
  public static final String PHRASE_SUMMARY_PRE = "phrases.pre";
  public static final String PHRASE_SUMMARY_POST = "phrases.post";
  public static final String PHRASE_INDEX_MAXLEN = "phrases.maxlength.index";
  public static final String PHRASE_QUERY_MAXLEN = "phrases.maxlength.query";

  @Override
  public void prepare(ResponseBuilder rb) throws IOException {
    final SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }
    if (params.getBool(ShardParams.IS_SHARD, false)) {
      // only one stage/purpose where we should do any work on a shard
      if (0 == (SHARD_PURPOSE & params.getInt(ShardParams.SHARDS_PURPOSE, 0))) {
        return;
      }
    }

    // if we're still here, then we should parse & validate our input,
    // putting it in the request context so our process method knows it should do work
    rb.req.getContext().put(this.getClass(), PhrasesContextData.parseAndValidateRequest(rb.req));
  }

  @Override
  public int distributedProcess(ResponseBuilder rb) {
    final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
    if (null == contextData) {
      // if prepare didn't give us anything to work with, then we should do nothing
      return ResponseBuilder.STAGE_DONE;
    }

    if (rb.stage < ResponseBuilder.STAGE_EXECUTE_QUERY) {
      return ResponseBuilder.STAGE_EXECUTE_QUERY;
    } else if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) {
      // if we're being used in conjunction with QueryComponent, it should have already created
      // (in this stage) the only ShardRequest we need...
      for (ShardRequest sreq : rb.outgoing) {
        if (0 != (SHARD_PURPOSE & sreq.purpose)) {
          return ResponseBuilder.STAGE_GET_FIELDS;
        }
      }
      // ...if we can't find it, then evidently we're being used in isolation,
      // and we need to create our own ShardRequest...
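      // (we clone the original request params, but strip the "shards" param so the
      // per-shard sub-requests don't try to re-distribute themselves)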
      ShardRequest sreq = new ShardRequest();
      sreq.purpose = SHARD_PURPOSE;
      sreq.params = new ModifiableSolrParams(rb.req.getParams());
      sreq.params.remove(ShardParams.SHARDS);
      rb.addRequest(this, sreq);
      return ResponseBuilder.STAGE_GET_FIELDS;

    } else if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
      // NOTE: we don't do any actual work in this stage, but we need to ensure that even if
      // we are being used in isolation w/o QueryComponent that SearchHandler "tracks" a STAGE_GET_FIELDS
      // so that finishStage(STAGE_GET_FIELDS) is called on us and we can add our merged results
      // (w/o needing extra code paths for merging phrase results when QueryComponent is/is not used)
      return ResponseBuilder.STAGE_DONE;
    }
    return ResponseBuilder.STAGE_DONE;
  }

  @Override
  public void finishStage(ResponseBuilder rb) {
    // NOTE: we don't do this after STAGE_EXECUTE_QUERY because if we're also being used with
    // QueryComponent, we don't want to add our results to the response until *after*
    // QueryComponent adds the main DocList

    final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
    if (null == contextData || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) {
      // if prepare didn't give us anything to work with, or this isn't our stage, then do nothing
      return;
    }

    // sanity check: the shard requests we use/piggy-back on should only happen once per shard,
    // but let's future proof ourselves against the possibility that some shards might get/respond
    // to the same request "purpose" multiple times...
    final BitSet shardsHandled = new BitSet(rb.shards.length);

    // Collect Shard responses
    for (ShardRequest sreq : rb.finished) {
      if (0 != (sreq.purpose & SHARD_PURPOSE)) {
        for (ShardResponse shardRsp : sreq.responses) {
          final int shardNum = rb.getShardNum(shardRsp.getShard());
          if (! shardsHandled.get(shardNum)) {
            shardsHandled.set(shardNum);
            // shards.tolerant=true can cause nulls on exceptions/errors
            // if we don't get phrases/stats from a shard, just ignore that shard
            final SolrResponse rsp = shardRsp.getSolrResponse();
            if (null == rsp) continue;
            final NamedList<Object> top = rsp.getResponse();
            if (null == top) continue;
            @SuppressWarnings("unchecked")
            final NamedList<Object> phrasesWrapper = (NamedList<Object>) top.get("phrases");
            if (null == phrasesWrapper) continue;
            @SuppressWarnings("unchecked")
            final List<NamedList<Object>> shardPhrases = (List<NamedList<Object>>) phrasesWrapper.get("_all");
            if (null == shardPhrases) continue;

            Phrase.populateStats(contextData.allPhrases, shardPhrases);
          }
        }
      }
    }
    scoreAndAddResultsToResponse(rb, contextData);
  }

  @Override
  public void process(ResponseBuilder rb) throws IOException {
    final PhrasesContextData contextData = (PhrasesContextData) rb.req.getContext().get(this.getClass());
    if (null == contextData) {
      // if prepare didn't give us anything to work with, then we should do nothing
      return;
    }

    // regardless of single node / shard, we need local stats...
    Phrase.populateStats(contextData.allPhrases, contextData.fieldWeights.keySet(), rb.req.getSearcher());

    if (rb.req.getParams().getBool(ShardParams.IS_SHARD, false)) {
      // shard request, return stats for all phrases (in original order)
      SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
      output.add("_all", Phrase.formatShardResponse(contextData.allPhrases));
      // TODO: might want to add numDocs() & getSumTotalTermFreq(f)/getDocCount(f) stats from each field...
      // so that we can sum/merge them for use in scoring?
      rb.rsp.add("phrases", output);
    } else {
      // full single node request...
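      // (no distributed coordination needed: the local stats populated above are the
      // global stats, so we can score and format the final response immediately)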
      scoreAndAddResultsToResponse(rb, contextData);
    }
  }

  /**
   * Helper method (suitable for both single node &amp; distributed coordinator node) to
   * score, sort, and format the end user response once all phrases have been populated with stats.
   */
  private void scoreAndAddResultsToResponse(final ResponseBuilder rb, final PhrasesContextData contextData) {
    assert null != contextData : "Should not be called if no phrase data to use";
    if (null == contextData) {
      // if prepare didn't give us anything to work with, then we should do nothing
      return;
    }

    SimpleOrderedMap<Object> output = new SimpleOrderedMap<>();
    rb.rsp.add("phrases", output);
    output.add("input", contextData.rawInput);

    if (0 == contextData.allPhrases.size()) {
      // w/o any phrases, the summary is just the input again...
      output.add("summary", contextData.rawInput);
      output.add("details", Collections.emptyList());
      return;
    }

    Phrase.populateScores(contextData);
    final int maxPosition = contextData.allPhrases.get(contextData.allPhrases.size()-1).getPositionEnd();

    final List<Phrase> validScoringPhrasesSorted = contextData.allPhrases.stream()
      // TODO: ideally this cut off of "0.0" should be a request option...
      // so users can tune how aggressive/conservative they want to be in finding phrases
      // but for that to be useful, we need:
      //  - more hard & fast documentation about the "range" of scores that may be returned
      //  - "useful" scores for single words
      .filter(p -> 0.0D < p.getTotalScore())
      .sorted(Comparator.comparing((p -> p.getTotalScore()), Collections.reverseOrder()))
      .collect(Collectors.toList());

    // we want to return only high scoring phrases that don't overlap w/higher scoring phrase
    final BitSet positionsCovered = new BitSet(maxPosition+1);
    final List<Phrase> results = new ArrayList<>(maxPosition);
    for (Phrase phrase : validScoringPhrasesSorted) {
      final BitSet phrasePositions = phrase.getPositionsBitSet();

      if (! phrasePositions.intersects(positionsCovered)) {
        // we can use this phrase, record it...
        positionsCovered.or(phrasePositions);
        results.add(phrase);
      } // else: overlaps higher scoring position(s), skip this phrase

      if (positionsCovered.cardinality() == maxPosition+1) {
        // all positions are covered, so we can bail out and skip the rest
        break;
      }
    }

    // a "quick summary" of the suggested parsing
    output.add("summary", contextData.summarize(results));
    // useful user level info on every (high scoring) phrase found (in current, descending score, order)
    output.add("details", results.stream()
               .map(p -> p.getDetails()).collect(Collectors.toList()));
  }

  @Override
  public String getDescription() {
    return "Phrases Identification Component";
  }

  /**
   * Simple container for all request options and data this component needs to store in the Request Context
   * @lucene.internal
   */
  public static final class PhrasesContextData {

    public final String rawInput;
    public final int maxIndexedPositionLength;
    public final int maxQueryPositionLength;
    public final Map<String,Double> fieldWeights;
    public final SchemaField analysisField;
    public final List<Phrase> allPhrases;
    public final String summaryPre;
    public final String summaryPost;

    // TODO: add an option to bias field weights based on sumTTF of the fields
    // (easy enough to "sum the sums" across multiple shards before scoring)

    /**
     * Parses the params included in this request, throwing appropriate user level
     * Exceptions for invalid input, and returning a PhrasesContextData
     * suitable for use in this request.
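     * <p>
     * For example (hypothetical param values): a request with
     * <code>phrases.fields=title^3 body</code> would produce a fieldWeights map of
     * <code>title=3.0</code> and <code>body=1.0</code> (the default weight when
     * none is specified).
     * </p>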
     */
    public static PhrasesContextData parseAndValidateRequest(final SolrQueryRequest req) throws SolrException {
      return new PhrasesContextData(req);
    }
    private PhrasesContextData(final SolrQueryRequest req) throws SolrException {
      final SolrParams params = req.getParams();

      this.rawInput = params.get(PHRASE_INPUT, params.get(CommonParams.Q));
      if (null == this.rawInput) {
        throw new SolrException(ErrorCode.BAD_REQUEST, "phrase identification requires a query string or "
                                + PHRASE_INPUT + " param override");
      }

      { // field weights & analysis field...

        SchemaField tmpAnalysisField = null;
        Map<String,Double> tmpWeights = new TreeMap<>();

        final String analysisFieldName = params.get(PHRASE_ANALYSIS_FIELD);
        if (null != analysisFieldName) {
          tmpAnalysisField = req.getSchema().getFieldOrNull(analysisFieldName);
          if (null == tmpAnalysisField) {
            throw new SolrException(ErrorCode.BAD_REQUEST,
                                    PHRASE_ANALYSIS_FIELD + " param specifies a field name that does not exist: " +
                                    analysisFieldName);
          }
        }

        final Map<String,Float> rawFields = SolrPluginUtils.parseFieldBoosts(params.getParams(PHRASE_FIELDS));
        if (rawFields.isEmpty()) {
          throw new SolrException(ErrorCode.BAD_REQUEST,
                                  PHRASE_FIELDS + " param must specify a (weighted) list of fields " +
                                  "to evaluate for phrase identification");
        }

        for (Map.Entry<String,Float> entry : rawFields.entrySet()) {
          final SchemaField field = req.getSchema().getFieldOrNull(entry.getKey());
          if (null == field) {
            throw new SolrException(ErrorCode.BAD_REQUEST,
                                    PHRASE_FIELDS + " param contains a field name that does not exist: " +
                                    entry.getKey());
          }
          if (null == tmpAnalysisField) {
            tmpAnalysisField = field;
          }
          if (null == analysisFieldName) {
            if (! field.getType().equals(tmpAnalysisField.getType())) {
              throw new SolrException
                (ErrorCode.BAD_REQUEST,
                 "All fields specified in " + PHRASE_FIELDS + " must have the same fieldType, " +
                 "or the advanced " + PHRASE_ANALYSIS_FIELD + " option must specify an override");
            }
          }
          // if a weight isn't specified, assume "1.0"
          final double weight = null == entry.getValue() ? 1.0D : entry.getValue();
          if (weight < 0) {
            throw new SolrException(ErrorCode.BAD_REQUEST,
                                    PHRASE_FIELDS + " param must use non-negative weight value for field " +
                                    field.getName());
          }
          tmpWeights.put(entry.getKey(), weight);
        }
        assert null != tmpAnalysisField;

        this.analysisField = tmpAnalysisField;
        this.fieldWeights = Collections.unmodifiableMap(tmpWeights);
      }

      { // index/query max phrase sizes...
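        // NOTE: these effective lengths default to the maxShingleSize found in the respective
        // index/query analyzers (see getMaxShingleSize), but each may be overridden via the
        // expert level phrases.maxlength.index / phrases.maxlength.query params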
        final FieldType ft = analysisField.getType();
        this.maxIndexedPositionLength = req.getParams().getInt(PHRASE_INDEX_MAXLEN,
                                                               getMaxShingleSize(ft.getIndexAnalyzer()));
        if (this.maxIndexedPositionLength < 0) {
          throw new SolrException(ErrorCode.BAD_REQUEST,
                                  "Unable to determine max position length of indexed phrases using " +
                                  "index analyzer for analysis field: " + analysisField.getName() +
                                  " and no override detected using param: " + PHRASE_INDEX_MAXLEN);
        }
        this.maxQueryPositionLength = req.getParams().getInt(PHRASE_QUERY_MAXLEN,
                                                             getMaxShingleSize(ft.getQueryAnalyzer()));
        if (this.maxQueryPositionLength < 0) {
          throw new SolrException(ErrorCode.BAD_REQUEST,
                                  "Unable to determine max position length of query phrases using " +
                                  "query analyzer for analysis field: " + analysisField.getName() +
                                  " and no override detected using param: " + PHRASE_QUERY_MAXLEN);
        }
        if (this.maxQueryPositionLength < this.maxIndexedPositionLength) {
          throw new SolrException
            (ErrorCode.BAD_REQUEST,
             "Effective value of " + PHRASE_INDEX_MAXLEN + " (either from index analyzer shingle factory, " +
             " or expert param override) must be less than or equal to the effective value of " +
             PHRASE_QUERY_MAXLEN + " (either from query analyzer shingle factory, or expert param override)");
        }
      }

      this.summaryPre = params.get(PHRASE_SUMMARY_PRE, "{");
      this.summaryPost = params.get(PHRASE_SUMMARY_POST, "}");

      this.allPhrases = Phrase.extractPhrases(this.rawInput, this.analysisField,
                                              this.maxIndexedPositionLength,
                                              this.maxQueryPositionLength);
    }

    /**
     * Given a list of phrases to be returned to the user, summarizes those phrases by decorating the
     * original input string to indicate where the identified phrases exist, using {@link #summaryPre}
     * and {@link #summaryPost}
     *
     * @param results a list of (non overlapping) Phrases that have been identified, sorted from highest scoring to lowest
     * @return the original user input, decorated to indicate the identified phrases
     */
    public String summarize(final List<Phrase> results) {
      final StringBuffer out = new StringBuffer(rawInput);

      // sort by *reverse* position so we can go back to front
      final List<Phrase> reversed = results.stream()
        .sorted(Comparator.comparing((p -> p.getPositionStart()), Collections.reverseOrder()))
        .collect(Collectors.toList());

      for (Phrase p : reversed) {
        out.insert(p.getOffsetEnd(), summaryPost);
        out.insert(p.getOffsetStart(), summaryPre);
      }
      return out.toString();
    }
  }

  /**
   * Model the data known about a single (candidate) Phrase -- which may or may not be indexed
   * @lucene.internal
   */
  public static final class Phrase {

    /**
     * Factory method for constructing a list of Phrases given the specified input and using the analyzer
     * for the specified field.  The maxIndexedPositionLength and
     * maxQueryPositionLength provided *must* match the effective values used by
     * respective analyzers.
     */
    public static List<Phrase> extractPhrases(final String input, final SchemaField analysisField,
                                              final int maxIndexedPositionLength,
                                              final int maxQueryPositionLength) {

      // TODO: rather than requiring the query analyzer to produce the Phrases for us (assuming Shingles)
      // we could potentially just require that it produces unigrams compatible with the unigrams in the
      // indexed fields, and then build our own Phrases at query time -- making the maxQueryPositionLength
      // a 100% run time configuration option.
      // But that could be tricky given an arbitrary analyzer -- we'd have to pay careful attention
      // to positions, and we'd have to guess/assume what placeholders/fillers were used in the
      // indexed Phrases (typically shingles)

      assert maxIndexedPositionLength <= maxQueryPositionLength;

      final CharsRefBuilder buffer = new CharsRefBuilder();
      final FieldType ft = analysisField.getType();
      final Analyzer analyzer = ft.getQueryAnalyzer();
      final List<Phrase> results = new ArrayList<>(42);
      try (TokenStream tokenStream = analyzer.tokenStream(analysisField.getName(), input)) {
        final OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
        final PositionIncrementAttribute posIncAttr = tokenStream.addAttribute(PositionIncrementAttribute.class);
        final PositionLengthAttribute posLenAttr = tokenStream.addAttribute(PositionLengthAttribute.class);
        final TermToBytesRefAttribute termAttr = tokenStream.addAttribute(TermToBytesRefAttribute.class);

        int position = 0;
        int lastPosLen = -1;

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
          final Phrase phrase = new Phrase();

          final int posInc = posIncAttr.getPositionIncrement();
          final int posLen = posLenAttr.getPositionLength();

          if (0 == posInc && posLen <= lastPosLen) {
            // This requirement of analyzers to return tokens in ascending order of length
            // is currently necessary for the "linking" logic below to work
            // if people run into real world situations where this is problematic,
            // we can relax this check if we also make the linking logic more complex
            // (ie: less optimized)
            throw new SolrException
              (ErrorCode.BAD_REQUEST, "Phrase identification currently requires that " +
               "the analyzer used must produce tokens that overlap in increasing order of length.");
          }

          position += posInc;
          lastPosLen = posLen;

          phrase.position_start = position;
          phrase.position_end = position + posLen;

          phrase.is_indexed = (posLen <= maxIndexedPositionLength);

          phrase.offset_start = offsetAttr.startOffset();
          phrase.offset_end = offsetAttr.endOffset();

          // populate the subsequence directly from the raw input using the offsets,
          // (instead of using the TermToBytesRefAttribute) so we preserve the original
          // casing, whitespace, etc...
          phrase.subSequence = input.subSequence(phrase.offset_start, phrase.offset_end);

          if (phrase.is_indexed) {
            // populate the bytes so we can build term queries
            phrase.bytes = BytesRef.deepCopyOf(termAttr.getBytesRef());
          }

          results.add(phrase);
        }
        tokenStream.end();
      } catch (IOException e) {
        throw new SolrException(ErrorCode.SERVER_ERROR,
                                "Analysis error extracting phrases from: " + input, e);
      }

      // fill in the relationships of each phrase
      //
      // NOTE: this logic currently requires that the phrases are sorted by position ascending
      // (automatic because of how PositionIncrementAttribute works) then by length ascending
      // (when positions are tied).
      // We could de-optimize this code if we find that secondary ordering is too restrictive for
      // some analyzers
      //
      // NOTE: changes to the scoring model may allow us to optimize/prune down the relationships tracked,
      // ...OR.... may require us to add/track more details about sub/parent phrases
      //
      for (int p = 0; p < results.size(); p++) {
        final Phrase current = results.get(p);
        if (! current.is_indexed) {
          // we're not an interesting sub phrase of anything
          continue;
        }

        // setup links from the phrase to itself if needed
        addLinkages(current, current, maxIndexedPositionLength);

        // scan backwards looking for phrases that might include us...
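        // (worked example, hypothetical numbers: if maxQueryPositionLength=7 and the current
        // phrase ends at position 10, then no phrase starting before position 3 can contain it;
        // and since results are ordered by start position, the backwards scan can stop at the
        // first phrase that starts too early)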
        BEFORE: for (int i = p-1; 0 <= i; i--) {
          final Phrase previous = results.get(i);
          if (previous.position_start < (current.position_end - maxQueryPositionLength)) {
            // we've scanned so far back nothing else is viable
            break BEFORE;
          }
          // any 'previous' phrases must start where current starts or earlier,
          // so only need to check the end...
          if (current.position_end <= previous.position_end) {
            addLinkages(previous, current, maxIndexedPositionLength);
          }
        }
        // scan forwards looking for phrases that might include us...
        AFTER: for (int i = p+1; i < results.size(); i++) {
          final Phrase next = results.get(i);
          // the only way a phrase that comes after current can include current is
          // if they have the same start position...
          if (current.position_start != next.position_start) {
            // we've scanned so far forward nothing else is viable
            break AFTER;
          }
          // any 'next' phrases must start where current starts, so only need to check the end...
          if (current.position_end <= next.position_end) {
            addLinkages(next, current, maxIndexedPositionLength);
          }
        }
      }
      return Collections.unmodifiableList(results);
    }

    /**
     * Given two phrases, one of which is a super set of the other, adds the necessary linkages
     * needed by the scoring model
     */
    private static void addLinkages(final Phrase outer, final Phrase inner,
                                    final int maxIndexedPositionLength) {

      assert outer.position_start <= inner.position_start;
      assert inner.position_end <= outer.position_end;
      assert inner.is_indexed;

      final int inner_len = inner.getPositionLength();
      if (1 == inner_len) {
        outer.individualIndexedTerms.add(inner);
      }
      if (maxIndexedPositionLength == inner_len
          || (inner == outer && inner_len < maxIndexedPositionLength)) {
        outer.largestIndexedSubPhrases.add(inner);
      }
      if (outer.is_indexed && inner != outer) {
        inner.indexedSuperPhrases.add(outer);
      }
    }

    /**
     * Format the phrases suitable for returning in a shard response
     * @see #populateStats(List,List)
     */
    public static List<NamedList<Object>> formatShardResponse(final List<Phrase> phrases) {
      List<NamedList<Object>> results = new ArrayList<>(phrases.size());
      for (Phrase p : phrases) {
        NamedList<Object> data = new SimpleOrderedMap<>();
        // quick and dirty way to validate that our shards aren't using different analyzers
        // so the coordinating node can fail fast when merging the results
        data.add("checksum", p.getChecksum());
        if (p.is_indexed) {
          data.add("ttf", new NamedList<>(p.phrase_ttf));
          data.add("df", new NamedList<>(p.phrase_df));
        }
        data.add("conj_dc", new NamedList<>(p.subTerms_conjunctionCounts));

        results.add(data);
      }
      return results;
    }

    /**
     * Populates the phrases with (merged) stats from a remote shard
     * @see #formatShardResponse
     */
    @SuppressWarnings("unchecked")
    public static void populateStats(final List<Phrase> phrases, final List<NamedList<Object>> shardData) {
      final int numPhrases = phrases.size();
      if (shardData.size() != numPhrases) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                                "num phrases in shard data not consistent: " +
                                numPhrases + " vs " + shardData.size());
      }
      for (int i = 0; i < phrases.size(); i++) {
        // rather than being paranoid about the expected structure, we'll just let the low level
        // code throw an NPE / CCE / AIOOBE / etc. and wrap & rethrow later...
        try {
          final Phrase p = phrases.get(i);
          final NamedList<Object> data = shardData.get(i);
          // sanity check the correct phrase
          if (! p.getChecksum().equals(data.get("checksum"))) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                                    "phrase #" + i + " in shard data had invalid checksum");
          }
          if (p.is_indexed) {
            for (Map.Entry<String,Long> ttf : (NamedList<Long>) data.get("ttf")) {
              p.phrase_ttf.merge(ttf.getKey(), ttf.getValue(), Long::sum);
            }
            for (Map.Entry<String,Long> df : (NamedList<Long>) data.get("df")) {
              p.phrase_df.merge(df.getKey(), df.getValue(), Long::sum);
            }
          }
          for (Map.Entry<String,Long> conj_dc : (NamedList<Long>) data.get("conj_dc")) {
            p.subTerms_conjunctionCounts.merge(conj_dc.getKey(), conj_dc.getValue(), Long::sum);
          }
        } catch (RuntimeException e) {
          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                                  "shard data for phrase#" + i + " not consistent", e);
        }
      }
    }

    /**
     * Populates the phrases with stats from the local index for the specified fields
     */
    public static void populateStats(final List<Phrase> phrases, final Collection<String> fieldNames,
                                     final SolrIndexSearcher searcher) throws IOException {
      final IndexReader reader = searcher.getIndexReader();
      for (String field : fieldNames) {
        for (Phrase phrase : phrases) {
          if (phrase.is_indexed) {
            // add stats based on this entire phrase as an indexed term
            final Term t = new Term(field, phrase.bytes);
            phrase.phrase_ttf.put(field, reader.totalTermFreq(t));
            phrase.phrase_df.put(field, (long)reader.docFreq(t));
          }

          // even if our phrase is too long to be indexed whole, add stats based on the
          // conjunction of all the individual terms in the phrase
          List<Query> filters = new ArrayList<>(phrase.individualIndexedTerms.size());
          for (Phrase term : phrase.individualIndexedTerms) {
            // trust the SolrIndexSearcher to cache & intersect the individual terms so that this
            // can be efficient regardless of how often terms are re-used in the input/phrases
            filters.add(new TermQuery(new Term(field, term.bytes)));
          }
          final long count = searcher.getDocSet(filters).size();
          phrase.subTerms_conjunctionCounts.put(field, count);
        }
      }
    }

    /**
     * Uses the previously populated stats to populate each Phrase with its scores for the specified
     * fields, and its overall (weighted) total score.  This is not needed on shard requests.
     *
     * @see #populateStats
     * @see #getFieldScore(String)
     * @see #getTotalScore
     */
    public static void populateScores(final PhrasesContextData contextData) {
      populateScores(contextData.allPhrases, contextData.fieldWeights,
                     contextData.maxIndexedPositionLength,
                     contextData.maxQueryPositionLength);
    }

    /**
     * Public for testing purposes
     * @see #populateScores(PhrasesIdentificationComponent.PhrasesContextData)
     * @lucene.internal
     */
    public static void populateScores(final List<Phrase> phrases, final Map<String,Double> fieldWeights,
                                      final int maxIndexedPositionLength,
                                      final int maxQueryPositionLength) {
      final double total_weight = fieldWeights.values().stream().mapToDouble(Double::doubleValue).sum();
      for (Phrase phrase : phrases) {
        double phrase_cumulative_score = 0.0D;
        for (Map.Entry<String,Double> entry : fieldWeights.entrySet()) {
          final String field = entry.getKey();
          final double weight = entry.getValue();
          double field_score = computeFieldScore(phrase, field,
                                                 maxIndexedPositionLength, maxQueryPositionLength);
          phrase.fieldScores.put(field, field_score);
          phrase_cumulative_score += (field_score * weight);
        }
        phrase.total_score = (total_weight < 0 ? Double.NEGATIVE_INFINITY
                              : (phrase_cumulative_score / total_weight));
      }
    }

    private Phrase() {
      // No-Op
    }

    private boolean is_indexed;
    private double total_score = -1.0D; // until we get a computed score, this is "not a phrase"

    private CharSequence subSequence;
    private BytesRef bytes;
    private int offset_start;
    private int offset_end;
    private int position_start;
    private int position_end;
    private Integer checksum = null;

    /** NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves */
    private final List<Phrase> individualIndexedTerms = new ArrayList<>(7);
    /**
     * NOTE: Indexed phrases of length less than the max indexed length are the (sole)
     * largest sub-phrases of themselves
     */
    private final List<Phrase> largestIndexedSubPhrases = new ArrayList<>(7);
    /** Phrases larger than this phrase which are indexed and fully contain it */
    private final List<Phrase> indexedSuperPhrases = new ArrayList<>(7);

    // NOTE: keys are field names
    private final Map<String,Long> subTerms_conjunctionCounts = new TreeMap<>();
    private final Map<String,Long> phrase_ttf = new TreeMap<>();
    private final Map<String,Long> phrase_df = new TreeMap<>();
    private final Map<String,Double> fieldScores = new TreeMap<>();

    public String toString() {
      return "'" + subSequence + "'"
        + "[" + offset_start + ":" + offset_end + "]"
        + "[" + position_start + ":" + position_end + "]";
    }

    public NamedList<Object> getDetails() {
      SimpleOrderedMap<Object> out = new SimpleOrderedMap<>();
      out.add("text", subSequence);
      out.add("offset_start", getOffsetStart());
      out.add("offset_end", getOffsetEnd());
      out.add("score", getTotalScore());
      out.add("field_scores", fieldScores);
      return out;
    }

    /**
     * Computes &amp; caches the checksum of this Phrase (if not already cached).
     * Needed only when merging shard data to validate no inconsistencies with the remote shards.
     */
    private Integer getChecksum() {
      if (null == checksum) {
        checksum = Arrays.hashCode(new int[] { offset_start, offset_end, position_start, position_end });
      }
      return checksum;
    }
    /** The characters from the original input that correspond with this Phrase */
    public CharSequence getSubSequence() {
      return subSequence;
    }

    /**
     * Returns the list of "individual" (ie: getPositionLength()==1) terms.
     * NOTE: Indexed phrases of length 1 are the (sole) individual terms of themselves
     */
    public List<Phrase> getIndividualIndexedTerms() {
      return individualIndexedTerms;
    }
    /**
     * Returns the list of (overlapping) sub phrases that have the largest possible size based on
     * the effective value of {@link PhrasesContextData#maxIndexedPositionLength}.
     * NOTE: Indexed phrases of length less than the max indexed length are the (sole)
     * largest sub-phrases of themselves.
     */
    public List<Phrase> getLargestIndexedSubPhrases() {
      return largestIndexedSubPhrases;
    }
    /**
     * Returns all phrases larger than this phrase, which fully include this phrase, and are indexed.
     * NOTE: A Phrase is never the super phrase of itself.
     */
    public List<Phrase> getIndexedSuperPhrases() {
      return indexedSuperPhrases;
    }

    /** NOTE: positions start at '1' */
    public int getPositionStart() {
      return position_start;
    }
    /** NOTE: positions start at '1' */
    public int getPositionEnd() {
      return position_end;
    }
    public int getPositionLength() {
      return position_end - position_start;
    }
    /** Each set bit identifies a position filled by this Phrase */
    public BitSet getPositionsBitSet() {
      final BitSet result = new BitSet();
      result.set(position_start, position_end);
      return result;
    }
    public int getOffsetStart() {
      return offset_start;
    }
    public int getOffsetEnd() {
      return offset_end;
    }

    /**
     * Returns the overall score for this Phrase.  In the current implementation,
     * the only guarantee made regarding the range of possible values is that 0 (or less) means
     * it is not a good phrase.
     *
     * @return A numeric value indicating the confidence in this Phrase, higher numbers are higher confidence.
     */
    public double getTotalScore() {
      return total_score;
    }
    /**
     * Returns the score for this Phrase in the given field.  In the current implementation,
     * the only guarantee made regarding the range of possible values is that 0 (or less) means
     * it is not a good phrase.
     *
     * @return A numeric value indicating the confidence in this Phrase for this field, higher numbers are higher confidence.
     */
    public double getFieldScore(String field) {
      return fieldScores.getOrDefault(field, -1.0D);
    }

    /**
     * Returns the total TTF of this (indexed) Phrase as a term in the specified field.
     * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
     * methods has been called with this field.
     */
    public long getTTF(String field) {
      if (!is_indexed) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                                "TTF is only available for indexed phrases");
      }
      return phrase_ttf.getOrDefault(field, 0L);
    }
    /**
     * Returns the number of documents that contain all of the {@link #getIndividualIndexedTerms}
     * that make up this Phrase, in the specified field.
     * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
     * methods has been called with this field.
     */
    public long getConjunctionDocCount(String field) {
      return subTerms_conjunctionCounts.getOrDefault(field, 0L);
    }
    /**
     * Returns the number of documents that contain this (indexed) Phrase as a term
     * in the specified field.
     * NOTE: behavior of calling this method is undefined unless one of the {@link #populateStats}
     * methods has been called with this field.
     */
    public long getDocFreq(String field) {
      if (!is_indexed) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                                "DF is only available for indexed phrases");
      }
      return phrase_df.getOrDefault(field, 0L);
    }

    /**
     * Uses the previously populated stats to compute a score for the specified field.
     *
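     * <p>
     * (Informally: the model compares how often the individual words co-occur as an indexed
     * shingle against how often they occur independently, discounted by the probability of any
     * larger indexed phrase that wraps them; see the inline comments in this method for the
     * full details of the model.)
     * </p>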
     * <p>
     * The current implementation returns scores in the range of [0,1], but this
     * may change in future implementations.  The only current guarantees are:
     * </p>
     * <ul>
     * <li>0 (or less) means this is guaranteed to not be a phrase</li>
     * <li>larger numbers are higher confidence</li>
     * </ul>
     *
     * @see #populateStats
     * @see #populateScores
     * @see #getFieldScore(String)
     * @return a score value
     */
    private static double computeFieldScore(final Phrase input, final String field,
                                            final int maxIndexedPositionLength,
                                            final int maxQueryPositionLength) {
      final long num_indexed_sub_phrases = input.getLargestIndexedSubPhrases().size();
      assert 0 <= num_indexed_sub_phrases; // should be impossible

      if (input.getIndividualIndexedTerms().size() < input.getPositionLength()) {
        // there are "gaps" in our input, where individual words have not been indexed (stop words,
        // or multivalue position gap) which means we are not a viable candidate for being a valid Phrase.
        return -1.0D;
      }

      final long phrase_conj_count = input.getConjunctionDocCount(field);
      // if there isn't a single document containing all the terms in our
      // phrase, then it is 100% not a phrase
      if (phrase_conj_count <= 0) {
        return -1.0D;
      }

      // single words automatically score 0.0 (unless they already scored less for not existing)
      if (input.getPositionLength() <= 1) {
        return 0.0D;
      }

      double field_score = 0.0D;
      long max_sub_conj_count = phrase_conj_count;

      // At the moment, the contribution of each "words" sub-Phrase to the field score of the input
      // Phrase is independent of any context of "input".  Depending on if/how sub-phrase scoring
      // changes, we might consider computing the scores of all the indexed phrases first, and
      // caching the portions of their values that are re-used when computing the scores of
      // longer phrases?
      //
      // This would make the overall scoring of all phrases a lot more complicated,
      // but could save CPU cycles?
      // (particularly when maxIndexedPositionLength <<< maxQueryPositionLength ???)
      //
      // My gut says that knowing the conj_count(input) "context" should help us score the
      // sub-phrases better, but i can't yet put my finger on why/how.  maybe by comparing
      // the conj_count(input) to the max(conj_count(parent of words)) ?

      // for each of the longest indexed phrases, aka indexed sub-sequences of "words", we have...
      for (Phrase words : input.getLargestIndexedSubPhrases()) {
        // we're going to compute scores in range of [-1:1] to indicate the likelihood that our
        // "words" should be used as a "phrase", based on a bayesian document categorization model,
        // where the "words as a phrase" (aka: phrase) is our candidate category.
        //
        // P(words|phrase) * P(phrase) - P(words|not phrase) * P(not phrase)
        //
        // Where...
        //  P(words|phrase)     =  phrase_ttf / min(word_ttf)
        //  P(phrase)           =~ phrase_docFreq / conj_count(words in phrase)  *SEE NOTE BELOW*
        //  P(words|not phrase) =  phrase_ttf / max(word_ttf)
        //  P(not a phrase)     =  1 - P(phrase)
        //
        // ... BUT! ...
        //
        // NOTE: we're going to reduce our "P(phrase)" by the max "P(phrase)" of all the (indexed)
        // candidate phrases we are a sub-phrase of, to try to offset the inherent bias in favor
        // of small indexed phrases -- because anytime the super-phrase exists, the sub-phrase exists

        // IDEA: consider replacing this entire bayesian model with LLR (or rootLLR)...
        //  http://mahout.apache.org/docs/0.13.0/api/docs/mahout-math/org/apache/mahout/math/stats/LogLikelihood.html
        // ...where we compute LLR over each of the TTF of the pairs of adjacent sub-phrases of each
        // indexed phrase and take the min|max|avg of the LLR scores.
        //
        // ie: for indexed shingle "quick brown fox" compute LLR(ttf("quick"), ttf("brown fox")) &
        // LLR(ttf("quick brown"), ttf("fox")) using ttf("quick brown fox") as the co-occurrence
        // count, and sumTTF-ttf("quick")-ttf("brown")-ttf("fox") as the "something else"
        //
        // (we could actually compute LLR stats over TTF and DF and combine them)
        //
        // NOTE: Going the LLR/rootLLR route would require building a full "tree" of every (indexed)
        // sub-phrase of every other phrase (or at least: all siblings of diff sizes that add up to
        // an existing phrase).  As well as require us to give up on a predictable "range" of
        // legal values for scores (IIUC from the LLR docs)

        final long phrase_ttf = words.getTTF(field);
        final long phrase_df = words.getDocFreq(field);
        final long words_conj_count = words.getConjunctionDocCount(field);
        max_sub_conj_count = Math.max(words_conj_count, max_sub_conj_count);

        final double max_wrapper_phrase_probability =
          words.getIndexedSuperPhrases().stream()
          .mapToDouble(p -> p.getConjunctionDocCount(field) <= 0 ?
                       // special case check -- we already know *our* conj count > 0,
                       // but we need a similar check for wrapper phrases: if <= 0, their probability is 0
                       0.0D : ((double)p.getDocFreq(field) / p.getConjunctionDocCount(field)))
          .max().orElse(0.0D);

        final LongSummaryStatistics words_ttfs =
          words.getIndividualIndexedTerms().stream()
          .collect(Collectors.summarizingLong(t -> t.getTTF(field)));

        final double words_phrase_prob = (phrase_ttf / (double)words_ttfs.getMin());
        final double words_not_phrase_prob = (phrase_ttf / (double)words_ttfs.getMax());

        final double phrase_prob = (phrase_conj_count / (double)words_conj_count);

        final double phrase_score = words_phrase_prob * (phrase_prob - max_wrapper_phrase_probability);
        final double not_phrase_score = words_not_phrase_prob * (1 - (phrase_prob - max_wrapper_phrase_probability));
        final double words_score = phrase_score - not_phrase_score;

        field_score += words_score;
      }

      // NOTE: the "scaling" factors below can "increase" negative scores (by reducing the unsigned value)
      // when they should ideally be penalizing the scores further, but since we currently don't care
      // about any score lower than 0, it's not worth worrying about.

      // Average the accumulated score over the number of actual indexed sub-phrases that contributed
      //
      // NOTE: since we subsequently want to multiply the score by a fraction with num_indexed_sub_phrases
      // in the numerator, we can skip this...
      // SEE BELOW
      // field_score /= (double) num_indexed_sub_phrases;

      // If we leave field_score as is, then a phrase longer than the maxIndexedPositionLength
      // will never score higher than the highest scoring sub-phrase it has (because we've averaged them)
      // so we scale the scores against the longest possible phrase length we're considering
      //
      // NOTE: We don't use num_indexed_sub_phrases in the numerator since we skipped it when
      // averaging above...
      field_score *= ( 1.0D // SEE ABOVE // * ( (double)num_indexed_sub_phrases )
                       / (1 + maxQueryPositionLength - maxIndexedPositionLength) );

      // scale the field_score based on the ratio of the conjunction docCount for the whole phrase
      // relative to the largest conjunction docCount of its (largest indexed) sub phrases, to penalize
      // the scores of very long phrases that exist very rarely relative to how often their
      // sub phrases exist in the index
      field_score *= ( ((double) phrase_conj_count) / max_sub_conj_count );

      return field_score;
    }
  }

  /**
   * Helper method, public for testing purposes only.
   * <p>
   * Given an analyzer, inspects it to determine if:
   * </p>
   * <ul>
   * <li>it is a {@link TokenizerChain}</li>
   * <li>it contains exactly one instance of {@link ShingleFilterFactory}</li>
   * </ul>
   * <p>
   * If these conditions are met, then this method returns the maxShingleSize
   * in effect for this analyzer, otherwise returns -1.
   * </p>
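   * <p>
   * (For example: given the sample "phrases" field type shown in the class level javadoc,
   * this method would return 3 for the index time analyzer and 7 for the query time analyzer.)
   * </p>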
   *
   * @param analyzer An analyzer to inspect
   * @return maxShingleSize if available
   * @lucene.internal
   */
  public static int getMaxShingleSize(Analyzer analyzer) {
    if (!TokenizerChain.class.isInstance(analyzer)) {
      return -1;
    }
    final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
    if (0 == factories.length) {
      return -1;
    }
    int result = -1;
    for (TokenFilterFactory tff : factories) {
      if (ShingleFilterFactory.class.isInstance(tff)) {
        if (0 < result) {
          // more than one shingle factory in our analyzer, which is weird, so make no assumptions...
          return -1;
        }
        // would be nice if there was an easy way to just ask a factory for the effective value
        // of an argument...
        final Map<String,String> args = tff.getOriginalArgs();
        result = args.containsKey("maxShingleSize")
          ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
      }
    }
    return result;
  }
}