All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.day.cq.search.suggest.builder.SearchIndexSuggestionExtractor Maven / Gradle / Ivy

/*
 * Copyright 1997-2010 Day Management AG
 * Barfuesserplatz 6, 4001 Basel, Switzerland
 * All Rights Reserved.
 *
 * This software is the confidential and proprietary information of
 * Day Management AG, ("Confidential Information"). You shall not
 * disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license agreement you entered into
 * with Day.
 */
package com.day.cq.search.suggest.builder;

import java.util.HashSet;
import java.util.Set;

import javax.jcr.Session;

import com.day.cq.search.suggest.SuggestionIndex;

import aQute.bnd.annotation.ProviderType;

/**
 * A service interface for creating or updating a suggestion index based on
 * repository content using the information from the Lucene search index. Will
 * extract the most frequent terms and build suggestions based on them.
 * 
 * 

* Note: This only works with CRX / Jackrabbit as the underlying JCR * repository, as this has to access the Lucene index directly to extract the * terms; the search part of the JCR API does not cover this special case. */ @ProviderType public interface SearchIndexSuggestionExtractor { /** * Creates or updates the {@link SuggestionIndex index} at the given path * based on terms extracted from the Lucene search index of the underlying * CRX/Jackrabbit repository. Uses the index of the workspace behind the * given {@link Session}. Various configuration parameters, for example * which fields from the index to use and what stop words list to use, can * be passed via the {@link Options}. * *

* Note that this will take terms from the entire repository. * Restricting it to content in a JCR subtree is not possible as this * information is not stored in the Lucene index. A trick is to store the * text to be indexed in a unique property (and maybe in a separate * workspace or repository), which is not used by other content. This * property would then be specified in {@link Options#properties}. * * @param session * used to access the Lucene index for the workspace * @param indexName * name or path for the {@link SuggestionIndex} to create or * update * @param options * configuration parameters * @return the number of extracted terms */ int buildIndex(Session session, String indexName, Options options); /** * Data object for holding all configuration options for * {@link SearchIndexSuggestionExtractor#buildIndex(Session, String, Options)}. */ static class Options implements Cloneable { /** * Maximum number of suggestions per term. See also * {@link SuggestionIndex#index(java.util.List, int, boolean)}. Defaults * to 10. */ public int maxSuggestions = 10; /** * Whether the index should be minimized. That means (longer) term * prefixes that result in only one suggestion should not be indexed; * only the first prefix that will result in this single suggestion * would be stored, but not for the remaining letters of the suggested * word. See also * {@link SuggestionIndex#index(java.util.List, int, boolean)}. Defaults * to true. */ public boolean minimizeIndex = true; /** * Maximum number of the most frequent terms to extract from the Lucene * index. Use 0 or smaller for no limit. Defaults to 10000. */ public int maxTerms = 10000; /** * Minimum threshold for the frequence of a term to be included. * Defaults to 2. */ public int minFrequency = 2; /** * Minimum length in characters of a term to be included. Defaults to 3. */ public int minTermLength = 3; /** * Path to a stop words file in the repository. These are terms that * will be ignored if found in the Lucene index. * *

* This must be a text file containing one word per line; lines starting * with "#" are ignored, as well as leading and trailing whitespace. * *

* To support multiple stop word files, please merge them in a new file * and point to that one. * *

* Defaults to a built-in english stop words file at * /libs/cq/search/content/suggest/stopwords/en.txt. */ public String stopWordsFile = "/libs/cq/search/content/suggest/stopwords/en.txt"; /** * Property value for {@link #properties} to use for addressing * the node-scoped full text index. */ public static final String FULL_TEXT = "."; /** * JCR properties from which to extract the terms. Use * {@link #FULL_TEXT} for the node-scoped full text index. Defaults to * text and title. */ public Set properties = new HashSet(); /** * Creates new Options. * * @param useDefaultProperties * if the {@link #properties} should be initialized with * default values or not */ public Options(boolean useDefaultProperties) { if (useDefaultProperties) { properties.add("text"); properties.add("title"); } } public Options clone() { try { return (Options) super.clone(); } catch (CloneNotSupportedException e) { // ignore return null; } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy