// com.day.cq.search.suggest.builder.SearchIndexSuggestionExtractor (Maven / Gradle / Ivy)

/*
 * Copyright 1997-2010 Day Management AG
 * Barfuesserplatz 6, 4001 Basel, Switzerland
 * All Rights Reserved.
 *
 * This software is the confidential and proprietary information of
 * Day Management AG, ("Confidential Information"). You shall not
 * disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license agreement you entered into
 * with Day.
 */
package com.day.cq.search.suggest.builder;

import java.util.HashSet;
import java.util.Set;

import javax.jcr.Session;

import com.day.cq.search.suggest.SuggestionIndex;

import aQute.bnd.annotation.ProviderType;

/**
 * A service interface for creating or updating a suggestion index based on
 * repository content using the information from the Lucene search index. Will
 * extract the most frequent terms and build suggestions based on them.
 * <p>
 * Note: This only works with CRX / Jackrabbit as the underlying JCR
 * repository, as this has to access the Lucene index directly to extract the
 * terms; the search part of the JCR API does not cover this special case.
 */
@ProviderType
public interface SearchIndexSuggestionExtractor {

    /**
     * Creates or updates the {@link SuggestionIndex index} at the given path
     * based on terms extracted from the Lucene search index of the underlying
     * CRX/Jackrabbit repository. Uses the index of the workspace behind the
     * given {@link Session}. Various configuration parameters, for example
     * which fields from the index to use and what stop words list to use, can
     * be passed via the {@link Options}.
     * <p>
     * Note that this will take terms from the entire repository.
     * Restricting it to content in a JCR subtree is not possible as this
     * information is not stored in the Lucene index. A trick is to store the
     * text to be indexed in a unique property (and maybe in a separate
     * workspace or repository), which is not used by other content. This
     * property would then be specified in {@link Options#properties}.
     *
     * @param session
     *            used to access the Lucene index for the workspace
     * @param indexName
     *            name or path for the {@link SuggestionIndex} to create or
     *            update
     * @param options
     *            configuration parameters
     * @return the number of extracted terms
     */
    int buildIndex(Session session, String indexName, Options options);

    /**
     * Data object for holding all configuration options for
     * {@link SearchIndexSuggestionExtractor#buildIndex(Session, String, Options)}.
     */
    static class Options implements Cloneable {
        /**
         * Maximum number of suggestions per term. See also
         * {@link SuggestionIndex#index(java.util.List, int, boolean)}. Defaults
         * to 10.
         */
        public int maxSuggestions = 10;

        /**
         * Whether the index should be minimized. That means (longer) term
         * prefixes that result in only one suggestion should not be indexed;
         * only the first prefix that will result in this single suggestion
         * would be stored, but not for the remaining letters of the suggested
         * word. See also
         * {@link SuggestionIndex#index(java.util.List, int, boolean)}. Defaults
         * to true.
         */
        public boolean minimizeIndex = true;

        /**
         * Maximum number of the most frequent terms to extract from the Lucene
         * index. Use 0 or smaller for no limit. Defaults to 10000.
         */
        public int maxTerms = 10000;
        
        /**
         * Minimum threshold for the frequence of a term to be included.
         * Defaults to 2.
         */
        public int minFrequency = 2;

        /**
         * Minimum length in characters of a term to be included. Defaults to 3.
         */
        public int minTermLength = 3;

        /**
         * Path to a stop words file in the repository. These are terms that
         * will be ignored if found in the Lucene index.
         * 
         * 

         * This must be a text file containing one word per line; lines starting
         * with "#" are ignored, as well as leading and trailing whitespace.
         * 
         * 

         * To support multiple stop word files, please merge them in a new file
         * and point to that one.
         * 
         * 
         * Defaults to a built-in english stop words file at
         * /libs/cq/search/content/suggest/stopwords/en.txt.
         */
        public String stopWordsFile = "/libs/cq/search/content/suggest/stopwords/en.txt";
        
        /**
         * Property value for {@link #properties} to use for addressing
         * the node-scoped full text index.
         */
        public static final String FULL_TEXT = ".";

        /**
         * JCR properties from which to extract the terms. Use
         * {@link #FULL_TEXT} for the node-scoped full text index. Defaults to
         * text and title.
         */
        public Set properties = new HashSet();

        /**
         * Creates new Options.
         * 
         * @param useDefaultProperties
         *            if the {@link #properties} should be initialized with
         *            default values or not
         */
        public Options(boolean useDefaultProperties) {
            if (useDefaultProperties) {
                properties.add("text");
                properties.add("title");
            }
        }
        
        public Options clone() {
            try {
                return (Options) super.clone();
            } catch (CloneNotSupportedException e) {
                // ignore
                return null;
            }
        }
    }
}