// com.day.cq.search.suggest.builder.SearchIndexSuggestionExtractor (Maven / Gradle / Ivy)
/*
* Copyright 1997-2010 Day Management AG
* Barfuesserplatz 6, 4001 Basel, Switzerland
* All Rights Reserved.
*
* This software is the confidential and proprietary information of
* Day Management AG, ("Confidential Information"). You shall not
* disclose such Confidential Information and shall use it only in
* accordance with the terms of the license agreement you entered into
* with Day.
*/
package com.day.cq.search.suggest.builder;
import java.util.HashSet;
import java.util.Set;
import javax.jcr.Session;
import com.day.cq.search.suggest.SuggestionIndex;
import aQute.bnd.annotation.ProviderType;
/**
 * A service interface for creating or updating a suggestion index based on
 * repository content using the information from the Lucene search index. It
 * extracts the most frequent terms and builds suggestions based on them.
 *
 * <p>
 * Note: This only works with CRX / Jackrabbit as the underlying JCR
 * repository, because the Lucene index has to be accessed directly to extract
 * the terms; the search part of the JCR API does not cover this special case.
 */
@ProviderType
public interface SearchIndexSuggestionExtractor {
/**
 * Creates or updates the {@link SuggestionIndex index} at the given path
 * based on terms extracted from the Lucene search index of the underlying
 * CRX/Jackrabbit repository. Uses the index of the workspace behind the
 * given {@link Session}. Various configuration parameters, for example
 * which fields from the index to use and which stop words list to apply,
 * can be passed via the {@link Options}.
 *
 * <p>
 * Note that this will take terms from the entire repository.
 * Restricting it to content in a JCR subtree is not possible, as this
 * information is not stored in the Lucene index. A workaround is to store
 * the text to be indexed in a unique property (and maybe in a separate
 * workspace or repository) which is not used by other content. This
 * property would then be specified in {@link Options#properties}.
 *
 * @param session
 *            used to access the Lucene index for the workspace
 * @param indexName
 *            name or path for the {@link SuggestionIndex} to create or
 *            update
 * @param options
 *            configuration parameters
 * @return the number of extracted terms
 */
int buildIndex(Session session, String indexName, Options options);
/**
 * Data object for holding all configuration options for
 * {@link SearchIndexSuggestionExtractor#buildIndex(Session, String, Options)}.
 *
 * <p>
 * All options are plain public fields that may be changed freely after
 * construction. Use {@link #clone()} to obtain an independent copy.
 */
// Member classes of an interface are implicitly public and static, so no
// explicit modifiers are needed here.
class Options implements Cloneable {

    /**
     * Maximum number of suggestions per term. See also
     * {@link SuggestionIndex#index(java.util.List, int, boolean)}. Defaults
     * to 10.
     */
    public int maxSuggestions = 10;

    /**
     * Whether the index should be minimized. That means (longer) term
     * prefixes that result in only one suggestion should not be indexed;
     * only the first prefix that will result in this single suggestion
     * would be stored, but not for the remaining letters of the suggested
     * word. See also
     * {@link SuggestionIndex#index(java.util.List, int, boolean)}. Defaults
     * to true.
     */
    public boolean minimizeIndex = true;

    /**
     * Maximum number of the most frequent terms to extract from the Lucene
     * index. Use 0 or smaller for no limit. Defaults to 10000.
     */
    public int maxTerms = 10000;

    /**
     * Minimum threshold for the frequency of a term to be included.
     * Defaults to 2.
     */
    public int minFrequency = 2;

    /**
     * Minimum length in characters of a term to be included. Defaults to 3.
     */
    public int minTermLength = 3;

    /**
     * Path to a stop words file in the repository. These are terms that
     * will be ignored if found in the Lucene index.
     *
     * <p>
     * This must be a text file containing one word per line; lines starting
     * with "#" are ignored, as well as leading and trailing whitespace.
     *
     * <p>
     * To support multiple stop word files, please merge them in a new file
     * and point to that one.
     *
     * <p>
     * Defaults to a built-in english stop words file at
     * {@code /libs/cq/search/content/suggest/stopwords/en.txt}.
     */
    public String stopWordsFile = "/libs/cq/search/content/suggest/stopwords/en.txt";

    /**
     * Property value for {@link #properties} to use for addressing
     * the node-scoped full text index.
     */
    public static final String FULL_TEXT = ".";

    /**
     * JCR properties from which to extract the terms. Use
     * {@link #FULL_TEXT} for the node-scoped full text index. Defaults to
     * {@code text} and {@code title} when the options are created with
     * default properties, and to an empty set otherwise.
     */
    public Set<String> properties = new HashSet<String>();

    /**
     * Creates new Options.
     *
     * @param useDefaultProperties
     *            if the {@link #properties} should be initialized with
     *            default values or not
     */
    public Options(boolean useDefaultProperties) {
        if (useDefaultProperties) {
            properties.add("text");
            properties.add("title");
        }
    }

    /**
     * Creates an independent copy of these options. Scalar settings are
     * copied by value and the {@link #properties} set is duplicated into a
     * new {@link HashSet}, so modifying the copy never affects this
     * instance (and vice versa). A {@code null} properties set stays
     * {@code null} in the copy.
     *
     * @return a new Options instance with the same settings; never
     *         {@code null}
     */
    @Override
    public Options clone() {
        final Options copy;
        try {
            copy = (Options) super.clone();
        } catch (CloneNotSupportedException e) {
            // Cannot happen: this class implements Cloneable. Fail loudly
            // instead of handing a null to the caller.
            throw new AssertionError(e);
        }
        // Object.clone() only copies the reference; give the copy its own
        // set so both instances can be configured independently.
        if (properties != null) {
            copy.properties = new HashSet<String>(properties);
        }
        return copy;
    }
}
}