org.apache.solr.handler.clustering.EngineParameters Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-clustering Show documentation
Apache Solr (module: clustering)
There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;
import org.apache.solr.common.params.SolrParams;

/**
 * {@link Engine} configuration parameters (and other parameters that may tweak clustering
 * algorithms on a per-request basis).
 *
 * <p>Instances are mutable. {@link #derivedFrom(SolrParams)} produces an independent copy with
 * request-level values applied on top of this object's values; the copy does not share any
 * collection state with the object it was derived from.
 *
 * @lucene.experimental
 */
public final class EngineParameters implements Cloneable {
  /** Common prefix for configuration of engine settings. */
  private static final String PARAM_PREFIX = "clustering.";

  /**
   * @see #algorithmName()
   */
  public static final String PARAM_ALGORITHM = PARAM_PREFIX + "algorithm";

  /**
   * @see #maxLabels()
   */
  public static final String PARAM_MAX_LABELS = PARAM_PREFIX + "maxLabels";

  /**
   * @see #includeSubclusters()
   */
  public static final String PARAM_INCLUDE_SUBCLUSTERS = PARAM_PREFIX + "includeSubclusters";

  /**
   * @see #includeOtherTopics()
   */
  public static final String PARAM_INCLUDE_OTHER_TOPICS = PARAM_PREFIX + "includeOtherTopics";

  /**
   * @see #language()
   */
  public static final String PARAM_LANGUAGE = PARAM_PREFIX + "language";

  /**
   * @see #languageField()
   */
  public static final String PARAM_LANGUAGE_FIELD = PARAM_PREFIX + "languageField";

  /**
   * @see #resources()
   */
  public static final String PARAM_RESOURCES = PARAM_PREFIX + "resources";

  /**
   * @see #fields()
   */
  public static final String PARAM_FIELDS = PARAM_PREFIX + "fields";

  /**
   * @see #preferQueryContext()
   */
  public static final String PARAM_PREFER_QUERY_CONTEXT = PARAM_PREFIX + "preferQueryContext";

  /**
   * @see #contextSize()
   */
  public static final String PARAM_CONTEXT_SIZE = PARAM_PREFIX + "contextSize";

  /**
   * @see #contextCount()
   */
  public static final String PARAM_CONTEXT_COUNT = PARAM_PREFIX + "contextCount";

  /**
   * @see #PARAM_MAX_LABELS
   */
  private int maxLabels = Integer.MAX_VALUE;

  /**
   * @see #PARAM_INCLUDE_SUBCLUSTERS
   */
  private boolean includeSubclusters = true;

  /**
   * @see #PARAM_INCLUDE_OTHER_TOPICS
   */
  private boolean includeOtherTopics = true;

  /**
   * @see #PARAM_ALGORITHM
   */
  private String algorithmName;

  /**
   * @see #PARAM_RESOURCES
   */
  private String resources;

  /**
   * @see #PARAM_LANGUAGE
   */
  private String language = "English";

  /**
   * @see #PARAM_LANGUAGE_FIELD
   */
  private String languageField;

  /**
   * @see #PARAM_PREFER_QUERY_CONTEXT
   */
  private boolean preferQueryContext;

  /**
   * @see #PARAM_CONTEXT_SIZE
   */
  private int contextSize = 80 * 4;

  /**
   * @see #PARAM_CONTEXT_COUNT
   */
  private int contextCount = 3;

  /**
   * Field names in the order they were first supplied.
   *
   * @see #PARAM_FIELDS
   */
  private LinkedHashSet<String> fields = new LinkedHashSet<>();

  /** Non-engine configuration parameters (algorithm parameters), in encounter order. */
  private LinkedHashMap<String, String> otherParameters = new LinkedHashMap<>();

  /**
   * Unique-value document identifier field. This is required for clustering since clusters only
   * reference documents by their ID field's value.
   */
  private String docIdField;

  /**
   * Creates engine parameters from field defaults overridden by whatever is present in {@code
   * params}.
   *
   * @param params source of parameter values.
   */
  EngineParameters(SolrParams params) {
    extractFrom(params);
  }

  /**
   * Extract parameter values from the given {@link SolrParams}.
   *
   * <p>Only keys actually present in {@code params} are touched; everything else keeps its
   * current value. Keys that are not one of the {@code PARAM_*} constants are stored in {@link
   * #otherParameters()}.
   *
   * @param params source of parameter values.
   * @return this object (for chaining).
   */
  private EngineParameters extractFrom(SolrParams params) {
    params.stream()
        .forEachOrdered(
            e -> {
              switch (e.getKey()) {
                case PARAM_MAX_LABELS:
                  maxLabels = params.getInt(PARAM_MAX_LABELS);
                  break;
                case PARAM_INCLUDE_SUBCLUSTERS:
                  includeSubclusters = params.getBool(PARAM_INCLUDE_SUBCLUSTERS);
                  break;
                case PARAM_INCLUDE_OTHER_TOPICS:
                  includeOtherTopics = params.getBool(PARAM_INCLUDE_OTHER_TOPICS);
                  break;
                case PARAM_ALGORITHM:
                  algorithmName = params.get(PARAM_ALGORITHM);
                  break;
                case PARAM_RESOURCES:
                  resources = params.get(PARAM_RESOURCES);
                  break;
                case PARAM_LANGUAGE:
                  language = params.get(PARAM_LANGUAGE);
                  break;
                case PARAM_LANGUAGE_FIELD:
                  languageField = params.get(PARAM_LANGUAGE_FIELD);
                  break;
                case PARAM_PREFER_QUERY_CONTEXT:
                  preferQueryContext = params.getBool(PARAM_PREFER_QUERY_CONTEXT);
                  break;
                case PARAM_CONTEXT_COUNT:
                  contextCount = params.getPrimitiveInt(PARAM_CONTEXT_COUNT);
                  break;
                case PARAM_CONTEXT_SIZE:
                  contextSize = params.getPrimitiveInt(PARAM_CONTEXT_SIZE);
                  break;
                case PARAM_FIELDS:
                  // Split on a comma followed by optional whitespace. Names are added to (not
                  // substituted for) whatever the set already contains.
                  fields.addAll(Arrays.asList(params.get(PARAM_FIELDS).split("[,]\\s*")));
                  break;
                default:
                  // Unrecognized parameter. Preserve it.
                  String[] value = e.getValue();
                  if (value != null) {
                    if (value.length == 1) {
                      otherParameters.put(e.getKey(), value[0]);
                    } else {
                      // Multi-valued (or empty) parameters are flattened to a single string.
                      otherParameters.put(e.getKey(), String.join(", ", value));
                    }
                  }
                  break;
              }
            });
    return this;
  }

  /**
   * @return Maximum number of returned cluster labels (even if the algorithm returns more).
   */
  int maxLabels() {
    return maxLabels;
  }

  /**
   * @return If {@code true}, include subclusters in response (if the algorithm produces
   *     hierarchical clustering).
   */
  boolean includeSubclusters() {
    return includeSubclusters;
  }

  /**
   * @return If {@code true}, include a synthetic cluster called "Other Topics" that consists of all
   *     documents not assigned to any other cluster.
   */
  boolean includeOtherTopics() {
    return includeOtherTopics;
  }

  /**
   * @return Name of the clustering algorithm to use (as loaded via the service extension point
   *     {@link org.carrot2.clustering.ClusteringAlgorithm}).
   */
  String algorithmName() {
    return algorithmName;
  }

  /**
   * @return Return Solr component-configuration relative language resources path.
   */
  String resources() {
    return resources;
  }

  /**
   * @return Name of the default language to use for clustering. The corresponding {@link
   *     org.carrot2.language.LanguageComponents} must be available (loaded via service provider
   *     extension).
   */
  String language() {
    return language;
  }

  /**
   * @return Name of the field that carries each document's language. {@code null} value means all
   *     documents will be clustered according to the default {@link #language()}. If not {@code
   *     null} and the document's field has a missing value, it will be clustered using the default
   *     {@link #language()} as well.
   */
  String languageField() {
    return languageField;
  }

  /**
   * @return Names of all fields whose textual content will be passed to the clustering engine.
   *     When supplied through {@link #PARAM_FIELDS} the names are comma-separated; whitespace
   *     directly following a comma is ignored. The returned set is this object's internal,
   *     mutable set (not a copy).
   */
  Set<String> fields() {
    return fields;
  }

  /**
   * @return Returns {@code true} if clustering should try to extract context fragments around the
   *     matching query regions rather than use full field content. Such context snippets typically
   *     cluster well because they carry a more compact and query-related information.
   */
  boolean preferQueryContext() {
    return preferQueryContext;
  }

  /**
   * @return Returns the maximum query context window to use if {@link #preferQueryContext()} is
   *     {@code true}.
   */
  int contextSize() {
    return contextSize;
  }

  /**
   * @return Returns the maximum number of different, non-contiguous query context snippets from a
   *     single field if {@link #preferQueryContext()} is {@code true}.
   */
  int contextCount() {
    return contextCount;
  }

  /**
   * @return Parameters that were not recognized as engine settings, keyed by parameter name. A
   *     parameter with several values is stored as those values joined with {@code ", "}. The
   *     returned map is this object's internal, mutable map (not a copy).
   */
  LinkedHashMap<String, String> otherParameters() {
    return otherParameters;
  }

  /**
   * Creates a copy of this object whose collections are independent of the original's.
   *
   * @return a copy that can be modified without affecting this instance.
   */
  @Override
  protected EngineParameters clone() {
    try {
      EngineParameters clone = (EngineParameters) super.clone();
      // Object.clone() is shallow: right after super.clone() both collection fields of the copy
      // still point at this instance's collections. Each one must be replaced with a fresh copy,
      // otherwise changes applied to the clone would leak back into this object.
      clone.otherParameters = new LinkedHashMap<>(this.otherParameters);
      clone.fields = new LinkedHashSet<>(this.fields);
      return clone;
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * @return Return a copy of this object with any parameters present in {@code params} overriding
   *     this object's values. Field names given via {@link #PARAM_FIELDS} are added to the copy's
   *     field set. This object is left unmodified.
   */
  EngineParameters derivedFrom(SolrParams params) {
    EngineParameters cloned = this.clone();
    cloned.extractFrom(params);
    return cloned;
  }

  /**
   * @return Name of the unique-value document identifier field.
   * @throws NullPointerException if the field has not been set via {@link
   *     #setDocIdField(String)}.
   */
  String docIdField() {
    return Objects.requireNonNull(docIdField);
  }

  /**
   * Sets the unique-value document identifier field.
   *
   * @param docIdField field name, must not be {@code null}.
   * @throws NullPointerException if {@code docIdField} is {@code null}.
   */
  void setDocIdField(String docIdField) {
    this.docIdField = Objects.requireNonNull(docIdField);
  }

  /**
   * @return A new set containing {@link #fields()}, followed by {@link #docIdField()} and, if it
   *     is neither {@code null} nor blank, {@link #languageField()}.
   * @throws NullPointerException if the document identifier field has not been set.
   */
  Set<String> getFieldsToLoad() {
    Set<String> fieldsToLoad = new LinkedHashSet<>(fields());
    fieldsToLoad.add(docIdField());
    String langField = languageField();
    if (langField != null && !langField.isBlank()) {
      fieldsToLoad.add(langField);
    }
    return fieldsToLoad;
  }
}