// com.day.cq.commons.servlets.AbstractSearchServlet Maven / Gradle / Ivy

/*
 * Copyright 1997-2009 Day Management AG
 * Barfuesserplatz 6, 4001 Basel, Switzerland
 * All Rights Reserved.
 *
 * This software is the confidential and proprietary information of
 * Day Management AG, ("Confidential Information"). You shall not
 * disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license agreement you entered into
 * with Day.
 */
package com.day.cq.commons.servlets;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.StringTokenizer;

import com.day.cq.commons.servlets.AbstractPredicateServlet;

/**
 * AbstractSearchServlet is a base class for search servlets.
 * <p>
 * It defines a set of shared string constants ({@link #QUERY},
 * {@link #START}, {@link #LIMIT}, {@link #TIDY}) and helper methods that
 * prepare user supplied query text, see {@link #applyWildcard(String)}.
 */
public abstract class AbstractSearchServlet extends AbstractPredicateServlet {

    private static final long serialVersionUID = 6105423525102347224L;

    /** Query clause (presumably the name of the request parameter). */
    public static final String QUERY = "query";

    /** Start index (presumably the name of the request parameter). */
    public static final String START = "start";

    /** Result limit (presumably the name of the request parameter). */
    public static final String LIMIT = "limit";

    /** tidy param */
    public static final String TIDY = "tidy";

    /**
     * Unmodifiable list of unicode blocks that contain characters that act
     * as words.
     */
    public static final List<Character.UnicodeBlock> WORD_CHARS;

    /**
     * Split terms at these characters. This list might not be complete and
     * is only a quick hack to fix bug# 27080.
     */
    public static final String SPLIT_CHARACTERS = " _-.,";

    static {
        // this list should be kept roughly synchronized with
        // the standard lucene tokenizer!
        List<Character.UnicodeBlock> list = new ArrayList<Character.UnicodeBlock>();
        // Chinese and Japanese
        list.add(Character.UnicodeBlock.HIRAGANA);
        list.add(Character.UnicodeBlock.KATAKANA);
        list.add(Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS);
        list.add(Character.UnicodeBlock.BOPOMOFO);
        list.add(Character.UnicodeBlock.CJK_COMPATIBILITY);
        list.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A);
        list.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS);
        list.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS);
        list.add(Character.UnicodeBlock.SPECIALS);
        // Korean
        list.add(Character.UnicodeBlock.HANGUL_SYLLABLES);
        list.add(Character.UnicodeBlock.HANGUL_JAMO);
        // wrap so that the public constant cannot be modified by callers
        WORD_CHARS = Collections.unmodifiableList(list);
    }

    /**
     * Checks whether the text is free of characters that act as words on
     * their own, i.e. characters that belong to one of the unicode blocks
     * listed in {@link #WORD_CHARS}.
     * <p>
     * The text is inspected one <code>char</code> at a time.
     *
     * @param text the text to check; must not be <code>null</code>.
     * @return <code>true</code> if no character of the text belongs to a
     *         block in {@link #WORD_CHARS} (this includes the empty string);
     *         <code>false</code> as soon as one such character is found.
     */
    protected boolean isSingleWord(String text) {
        for (int i = 0; i < text.length(); i++) {
            if (WORD_CHARS.contains(Character.UnicodeBlock.of(text.charAt(i)))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Conditionally appends a wildcard to the query text if the
     * text is considered a single word. This method also breaks
     * the text into multiple terms at {@link #SPLIT_CHARACTERS} and joins
     * the terms again with a single space. The wildcard is only added to
     * the last term.
     * <p>
     * If the text is not considered a single word it is returned unchanged,
     * without splitting and without wildcard. If the text does not contain
     * any term (empty text or split characters only) the result is just the
     * wildcard <code>"*"</code>.
     *
     * See also: {@link #isSingleWord(String)}.
     *
     * @param text the query text; must not be <code>null</code>.
     * @return the processed query text, possibly with appended '*' wildcard.
     */
    protected String applyWildcard(String text) {
        // only append * if query string is a single word
        if (!isSingleWord(text)) {
            return text;
        }
        // buffer is local to this call, no synchronization needed
        StringBuilder modified = new StringBuilder();
        StringTokenizer t = new StringTokenizer(text, SPLIT_CHARACTERS);
        // separator is empty before the first term and a space afterwards
        String space = "";
        while (t.hasMoreTokens()) {
            modified.append(space);
            space = " ";
            modified.append(t.nextToken());
        }
        // the wildcard directly follows the last term
        modified.append("*");
        return modified.toString();
    }
}