org.modeshape.jcr.query.parse.FullTextSearchParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of modeshape-jcr
ModeShape implementation of the JCR API
There is a newer version: 5.4.1.Final
/*
 * ModeShape (http://www.modeshape.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.modeshape.jcr.query.parse;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.modeshape.common.CommonI18n;
import org.modeshape.common.text.ParsingException;
import org.modeshape.common.text.Position;
import org.modeshape.common.text.TokenStream;
import org.modeshape.common.text.TokenStream.CharacterStream;
import org.modeshape.common.text.TokenStream.Token;
import org.modeshape.common.text.TokenStream.Tokenizer;
import org.modeshape.common.text.TokenStream.Tokens;
import org.modeshape.common.util.CheckArg;
import org.modeshape.jcr.api.query.Query;
import org.modeshape.jcr.query.model.AllNodes;
import org.modeshape.jcr.query.model.Column;
import org.modeshape.jcr.query.model.Constraint;
import org.modeshape.jcr.query.model.FullTextSearch;
import org.modeshape.jcr.query.model.FullTextSearch.Conjunction;
import org.modeshape.jcr.query.model.FullTextSearch.Disjunction;
import org.modeshape.jcr.query.model.FullTextSearch.NegationTerm;
import org.modeshape.jcr.query.model.FullTextSearch.SimpleTerm;
import org.modeshape.jcr.query.model.FullTextSearch.Term;
import org.modeshape.jcr.query.model.Limit;
import org.modeshape.jcr.query.model.NullOrder;
import org.modeshape.jcr.query.model.Order;
import org.modeshape.jcr.query.model.Ordering;
import org.modeshape.jcr.query.model.PropertyValue;
import org.modeshape.jcr.query.model.QueryCommand;
import org.modeshape.jcr.query.model.SelectQuery;
import org.modeshape.jcr.query.model.Selector;
import org.modeshape.jcr.query.model.SelectorName;
import org.modeshape.jcr.query.model.TypeSystem;

/**
 * A {@link QueryParser} implementation that parses a full-text search expression. This grammar is based on the full-text search
 * grammar as defined by the JCR 2.0 specification.
 * <h3>Grammar</h3>
 * <p>
 * The grammar for the full-text expression is taken from the JCR 2.0 specification, and is as follows:
 * </p>
 * 
 * <pre>
 * FulltextSearch ::= Disjunct {Space 'OR' Space Disjunct}
 * Disjunct ::= Term {Space Term}
 * Term ::= ['-'] SimpleTerm
 * SimpleTerm ::= Word | '"' Word {Space Word} '"'
 * Word ::= NonSpaceChar {NonSpaceChar}
 * Space ::= SpaceChar {SpaceChar}
 * NonSpaceChar ::= Char - SpaceChar &#47;* Any Char except SpaceChar *&#47;
 * SpaceChar ::= ' '
 * Char ::= &#47;* Any character *&#47;
 * </pre>
 */
public class FullTextSearchParser implements QueryParser {

    /** The name of the query language handled by this parser. */
    public static final String LANGUAGE = Query.FULL_TEXT_SEARCH;

    // Fixed pieces of the SELECT query that wraps every full-text search expression.
    private static final Selector FULL_TEXT_SOURCE = new AllNodes();
    private static final SelectorName FULL_TEXT_SELECTOR_NAME = FULL_TEXT_SOURCE.name();
    private static final String SCORE_COLUMN_NAME = "jcr:score";
    // Deliberately left non-final so that the existing contract towards subclasses is unchanged.
    protected static List<Column> FULL_TEXT_COLUMNS = Collections.singletonList(new Column(FULL_TEXT_SELECTOR_NAME,
                                                                                           SCORE_COLUMN_NAME,
                                                                                           SCORE_COLUMN_NAME));
    private static final List<Ordering> FULL_TEXT_ORDERING = Collections.singletonList(new Ordering(
                                                                                                    new PropertyValue(
                                                                                                                      FULL_TEXT_SELECTOR_NAME,
                                                                                                                      SCORE_COLUMN_NAME),
                                                                                                    Order.DESCENDING,
                                                                                                    NullOrder.NULLS_LAST));
    private static final boolean FULL_TEXT_DISTINCT = false;

    // Shared instance used by parseQuery(...) to validate the expression.
    private static final FullTextSearchParser PARSER = new FullTextSearchParser();

    @Override
    public String getLanguage() {
        return LANGUAGE;
    }

    /**
     * Validate the supplied full-text search expression and wrap it in a query over all nodes. The resulting query selects the
     * score column, is ordered by descending score (nulls last), has no limit and is not distinct.
     * 
     * @param query the full-text search expression
     * @param typeSystem the type system; not used by this implementation
     * @return the query command representing the full-text search
     * @throws InvalidQueryException if the expression could not be parsed
     */
    @Override
    public QueryCommand parseQuery( String query,
                                    TypeSystem typeSystem ) throws InvalidQueryException {
        // Parse the terms only to validate the expression; the parsed result itself is discarded ...
        try {
            PARSER.parse(query);
        } catch (ParsingException e) {
            throw new InvalidQueryException(query, e.getMessage());
        }
        // Now create a query that represents this full-text search ...
        Constraint constraint = new FullTextSearch(FULL_TEXT_SELECTOR_NAME, query);
        return new SelectQuery(FULL_TEXT_SOURCE, constraint, FULL_TEXT_ORDERING, FULL_TEXT_COLUMNS, Limit.NONE,
                               FULL_TEXT_DISTINCT);
    }

    /**
     * Parse the full-text search criteria given in the supplied string.
     * <p>
     * NOTE(review): {@link #parseTerm(TokenStream)} unconditionally consumes a token, so the <code>null</code> result appears
     * reachable only when a subclass overrides the protected parse methods to return null; confirm how
     * <code>TokenStream.consume()</code> behaves on exhausted input.
     * </p>
     * 
     * @param fullTextSearchExpression the full-text search expression; may not be null
     * @return the term representation of the full-text search, or null if there are no terms
     * @throws ParsingException if there is an error parsing the supplied string
     * @throws IllegalArgumentException if the expression is null
     */
    public Term parse( String fullTextSearchExpression ) {
        CheckArg.isNotNull(fullTextSearchExpression, "fullTextSearchExpression");
        Tokenizer tokenizer = new TermTokenizer();
        TokenStream stream = new TokenStream(fullTextSearchExpression, tokenizer, false);
        return parse(stream.start());
    }

    /**
     * Parse the full-text search criteria from the supplied token stream. This method is useful when the full-text search
     * expression is included in other content.
     * 
     * @param tokens the token stream containing the full-text search starting on the next token
     * @return the term representation of the full-text search, or null if there are no terms; a single term is returned as-is,
     *         while several terms separated by "OR" are wrapped in a {@link Disjunction}
     * @throws ParsingException if there is an error parsing the supplied string
     * @throws IllegalArgumentException if the token stream is null
     */
    public Term parse( TokenStream tokens ) {
        CheckArg.isNotNull(tokens, "tokens");
        List<Term> terms = new ArrayList<Term>();
        do {
            Term term = parseDisjunctedTerms(tokens);
            if (term == null) break;
            terms.add(term);
        } while (tokens.canConsume("OR"));
        if (terms.isEmpty()) return null;
        return terms.size() > 1 ? new Disjunction(terms) : terms.get(0);
    }

    /**
     * Parse one operand of an "OR" expression: a run of adjacent terms that ends at the next "OR" token or at the end of the
     * stream.
     * 
     * @param tokens the token stream positioned at the first term of the run
     * @return the single term if the run contains just one, a {@link Conjunction} of the terms if it contains several, or null
     *         if no term was parsed
     * @throws ParsingException if there is an error parsing the terms
     */
    protected Term parseDisjunctedTerms( TokenStream tokens ) {
        List<Term> terms = new ArrayList<Term>();
        do {
            Term term = parseTerm(tokens);
            if (term == null) break;
            terms.add(term);
        } while (tokens.hasNext() && !tokens.matches("OR"));
        if (terms.isEmpty()) return null;
        return terms.size() > 1 ? new Conjunction(terms) : terms.get(0);
    }

    /**
     * Parse a single term, which may be prefixed with '-' (negation) or '+' (accepted and ignored).
     * 
     * @param tokens the token stream positioned at the term
     * @return a {@link SimpleTerm} built from the next token with its surrounding quotes removed, wrapped in a
     *         {@link NegationTerm} when the term was prefixed with '-'
     * @throws ParsingException if there is an error consuming the term
     */
    protected Term parseTerm( TokenStream tokens ) {
        boolean negated = tokens.canConsume('-');
        // A leading '+' carries no meaning, so it is simply skipped when present ...
        if (!negated) tokens.canConsume('+');
        Term result = new SimpleTerm(removeQuotes(tokens.consume()));
        return negated ? new NegationTerm(result) : result;
    }

    /**
     * Remove any leading and trailing single- or double-quotes from the supplied text. Every quote character in the leading
     * and trailing runs is removed, regardless of whether the two runs match.
     * 
     * @param text the input text; may not be null
     * @return the text without leading and trailing quotes, or text if there were no quotes
     */
    protected String removeQuotes( String text ) {
        return text.replaceFirst("^['\"]+", "").replaceAll("['\"]+$", "");
    }

    /**
     * A {@link Tokenizer} implementation for full-text search expressions. It skips space, tab, newline and carriage-return
     * characters and produces tokens for individual '+' and '-' characters found at the start of a token, for single-quoted
     * strings, for double-quoted strings, and for words, where a word is every other run of characters up to the next
     * whitespace.
     */
    public static class TermTokenizer implements Tokenizer {
        /**
         * The {@link Token#type() token type} for tokens that represent an unquoted string containing a character sequence
         * that runs up to the next whitespace character.
         */
        public static final int WORD = 1;
        /**
         * The {@link Token#type() token type} for tokens that consist of an individual '+' or '-' characters. The set of
         * characters includes: <code>-+</code>
         */
        public static final int PLUS_MINUS = 2;
        /**
         * The {@link Token#type() token type} for tokens that consist of a single-quoted string. Single quote characters are
         * included in the string if they are preceded (escaped) by a '\' character.
         */
        public static final int SINGLE_QUOTED_STRING = 4;
        /**
         * The {@link Token#type() token type} for tokens that consist of a double-quoted string. Double quote characters are
         * included in the string if they are preceded (escaped) by a '\' character.
         */
        public static final int DOUBLE_QUOTED_STRING = 8;

        protected TermTokenizer() {
        }

        /**
         * Break the supplied character stream into {@link #PLUS_MINUS}, {@link #SINGLE_QUOTED_STRING},
         * {@link #DOUBLE_QUOTED_STRING} and {@link #WORD} tokens.
         * 
         * @param input the character stream to read
         * @param tokens the collector to which each token is added
         * @throws ParsingException if a quoted string has no matching closing quote
         */
        @Override
        public void tokenize( CharacterStream input,
                              Tokens tokens ) throws ParsingException {
            while (input.hasNext()) {
                char c = input.next();
                switch (c) {
                    case ' ':
                    case '\t':
                    case '\n':
                    case '\r':
                        // Just skip these whitespace characters ...
                        break;
                    case '-':
                    case '+':
                        tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, PLUS_MINUS);
                        break;
                    case '"': {
                        int startIndex = input.index();
                        Position startingPosition = input.position(startIndex);
                        if (!consumeThroughClosingQuote(input, '"')) {
                            String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(),
                                                                                    startingPosition.getColumn());
                            throw new ParsingException(startingPosition, msg);
                        }
                        int endIndex = input.index() + 1; // beyond last character read
                        tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
                        break;
                    }
                    case '\'': {
                        int startIndex = input.index();
                        Position startingPosition = input.position(startIndex);
                        if (!consumeThroughClosingQuote(input, '\'')) {
                            String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(),
                                                                                    startingPosition.getColumn());
                            throw new ParsingException(startingPosition, msg);
                        }
                        int endIndex = input.index() + 1; // beyond last character read
                        tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
                        break;
                    }
                    default: {
                        int startIndex = input.index();
                        Position startingPosition = input.position(startIndex);
                        // Read until another whitespace is found
                        while (input.hasNext() && !input.isNextWhitespace()) {
                            input.next();
                        }
                        int endIndex = input.index() + 1; // beyond last character that was included
                        tokens.addToken(startingPosition, startIndex, endIndex, WORD);
                    }
                }
            }
        }

        /**
         * Advance the stream past the remainder of a quoted string whose opening quote has already been read. A quote
         * character that immediately follows a '\' is treated as escaped and does not end the string.
         * 
         * @param input the character stream, positioned just after the opening quote
         * @param quote the quote character that ends the string
         * @return true if the closing quote was found (it is then the last character read), or false if the stream ended first
         */
        private static boolean consumeThroughClosingQuote( CharacterStream input,
                                                           char quote ) {
            while (input.hasNext()) {
                char c = input.next();
                if (c == '\\' && input.isNext(quote)) {
                    input.next(); // consume the quote character since it is escaped
                } else if (c == quote) {
                    return true;
                }
            }
            return false;
        }
    }

}