// org.modeshape.jcr.query.parse.FullTextSearchParser Maven / Gradle / Ivy
/*
* ModeShape (http://www.modeshape.org)
* See the COPYRIGHT.txt file distributed with this work for information
* regarding copyright ownership. Some portions may be licensed
* to Red Hat, Inc. under one or more contributor license agreements.
* See the AUTHORS.txt file in the distribution for a full listing of
* individual contributors.
*
* ModeShape is free software. Unless otherwise indicated, all code in ModeShape
* is licensed to you under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* ModeShape is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package org.modeshape.jcr.query.parse;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.modeshape.common.CommonI18n;
import org.modeshape.common.text.ParsingException;
import org.modeshape.common.text.Position;
import org.modeshape.common.text.TokenStream;
import org.modeshape.common.text.TokenStream.CharacterStream;
import org.modeshape.common.text.TokenStream.Token;
import org.modeshape.common.text.TokenStream.Tokenizer;
import org.modeshape.common.text.TokenStream.Tokens;
import org.modeshape.common.util.CheckArg;
import org.modeshape.jcr.api.query.Query;
import org.modeshape.jcr.query.model.AllNodes;
import org.modeshape.jcr.query.model.Column;
import org.modeshape.jcr.query.model.Constraint;
import org.modeshape.jcr.query.model.FullTextSearch;
import org.modeshape.jcr.query.model.FullTextSearch.Conjunction;
import org.modeshape.jcr.query.model.FullTextSearch.Disjunction;
import org.modeshape.jcr.query.model.FullTextSearch.NegationTerm;
import org.modeshape.jcr.query.model.FullTextSearch.SimpleTerm;
import org.modeshape.jcr.query.model.FullTextSearch.Term;
import org.modeshape.jcr.query.model.Limit;
import org.modeshape.jcr.query.model.Order;
import org.modeshape.jcr.query.model.Ordering;
import org.modeshape.jcr.query.model.PropertyValue;
import org.modeshape.jcr.query.model.QueryCommand;
import org.modeshape.jcr.query.model.SelectQuery;
import org.modeshape.jcr.query.model.Selector;
import org.modeshape.jcr.query.model.SelectorName;
import org.modeshape.jcr.query.model.TypeSystem;
/**
* A {@link QueryParser} implementation that parses a full-text search expression. This grammar is based on the full-text search
* grammar as defined by the JCR 2.0 specification.
*
*
* Grammar
*
* The grammar for the full-text expression is taken from the JCR 2.0 specification, and is as follows:
*
*
*
* FulltextSearch ::= Disjunct {Space 'OR' Space Disjunct}
* Disjunct ::= Term {Space Term}
* Term ::= ['-'] SimpleTerm
* SimpleTerm ::= Word | '"' Word {Space Word} '"'
* Word ::= NonSpaceChar {NonSpaceChar}
* Space ::= SpaceChar {SpaceChar}
* NonSpaceChar ::= Char - SpaceChar /* Any Char except SpaceChar */
* SpaceChar ::= ' '
* Char ::= /* Any character */
*
*/
public class FullTextSearchParser implements QueryParser {
public static final String LANGUAGE = Query.FULL_TEXT_SEARCH;
private static Selector FULL_TEXT_SOURCE = new AllNodes();
private static SelectorName FULL_TEXT_SELECTOR_NAME = FULL_TEXT_SOURCE.name();
private static String SCORE_COLUMN_NAME = "jcr:score";
protected static List extends Column> FULL_TEXT_COLUMNS = Collections.singletonList(new Column(FULL_TEXT_SELECTOR_NAME,
SCORE_COLUMN_NAME,
SCORE_COLUMN_NAME));
private static List extends Ordering> FULL_TEXT_ORDERING = Collections.singletonList(new Ordering(
new PropertyValue(
FULL_TEXT_SELECTOR_NAME,
SCORE_COLUMN_NAME),
Order.DESCENDING));
private static boolean FULL_TEXT_DISTINCT = true;
private static FullTextSearchParser PARSER = new FullTextSearchParser();
@Override
public String getLanguage() {
return LANGUAGE;
}
@Override
public QueryCommand parseQuery( String query,
TypeSystem typeSystem ) throws InvalidQueryException {
// Parse the terms ...
try {
PARSER.parse(query);
} catch (ParsingException e) {
throw new InvalidQueryException(query, e.getMessage());
}
// Now create a query that represents this full-text search ...
Constraint constraint = new FullTextSearch(FULL_TEXT_SELECTOR_NAME, query);
return new SelectQuery(FULL_TEXT_SOURCE, constraint, FULL_TEXT_ORDERING, FULL_TEXT_COLUMNS, Limit.NONE,
FULL_TEXT_DISTINCT);
}
/**
* Parse the full-text search criteria given in the supplied string.
*
* @param fullTextSearchExpression the full-text search expression; may not be null
* @return the term representation of the full-text search, or null if there are no terms
* @throws ParsingException if there is an error parsing the supplied string
* @throws IllegalArgumentException if the expression is null
*/
public Term parse( String fullTextSearchExpression ) {
CheckArg.isNotNull(fullTextSearchExpression, "fullTextSearchExpression");
Tokenizer tokenizer = new TermTokenizer();
TokenStream stream = new TokenStream(fullTextSearchExpression, tokenizer, false);
return parse(stream.start());
}
/**
* Parse the full-text search criteria from the supplied token stream. This method is useful when the full-text search
* expression is included in other content.
*
* @param tokens the token stream containing the full-text search starting on the next token
* @return the term representation of the full-text search, or null if there are no terms
* @throws ParsingException if there is an error parsing the supplied string
* @throws IllegalArgumentException if the token stream is null
*/
public Term parse( TokenStream tokens ) {
CheckArg.isNotNull(tokens, "tokens");
List terms = new ArrayList();
do {
Term term = parseDisjunctedTerms(tokens);
if (term == null) break;
terms.add(term);
} while (tokens.canConsume("OR"));
if (terms.isEmpty()) return null;
return terms.size() > 1 ? new Disjunction(terms) : terms.iterator().next();
}
protected Term parseDisjunctedTerms( TokenStream tokens ) {
List terms = new ArrayList();
do {
Term term = parseTerm(tokens);
if (term == null) break;
terms.add(term);
} while (tokens.hasNext() && !tokens.matches("OR"));
if (terms.isEmpty()) return null;
return terms.size() > 1 ? new Conjunction(terms) : terms.iterator().next();
}
protected Term parseTerm( TokenStream tokens ) {
boolean negated = tokens.canConsume('-');
if (!negated) tokens.canConsume('+');
Term result = new SimpleTerm(removeQuotes(tokens.consume()));
return negated ? new NegationTerm(result) : result;
}
/**
* Remove any leading and trailing single- or double-quotes from the supplied text.
*
* @param text the input text; may not be null
* @return the text without leading and trailing quotes, or text
if there were no quotes
*/
protected String removeQuotes( String text ) {
return text.replaceFirst("^['\"]+", "").replaceAll("['\"]+$", "");
}
/**
* A basic {@link Tokenizer} implementation that ignores whitespace but includes tokens for individual symbols, the period
* ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments.
*
* Note this Tokenizer may not be appropriate in many situations, but is provided merely as a convenience for those situations
* that happen to be able to use it.
*
*/
public static class TermTokenizer implements Tokenizer {
/**
* The {@link Token#type() token type} for tokens that represent an unquoted string containing a character sequence made
* up of non-whitespace and non-symbol characters.
*/
public static final int WORD = 1;
/**
* The {@link Token#type() token type} for tokens that consist of an individual '+' or '-' characters. The set of
* characters includes: -+
*/
public static final int PLUS_MINUS = 2;
/**
* The {@link Token#type() token type} for tokens that consist of all the characters within single-quotes. Single quote
* characters are included if they are preceded (escaped) by a '\' character.
*/
public static final int SINGLE_QUOTED_STRING = 4;
/**
* The {@link Token#type() token type} for tokens that consist of all the characters within double-quotes. Double quote
* characters are included if they are preceded (escaped) by a '\' character.
*/
public static final int DOUBLE_QUOTED_STRING = 8;
protected TermTokenizer() {
}
@Override
public void tokenize( CharacterStream input,
Tokens tokens ) throws ParsingException {
while (input.hasNext()) {
char c = input.next();
switch (c) {
case ' ':
case '\t':
case '\n':
case '\r':
// Just skip these whitespace characters ...
break;
case '-':
case '+':
tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, PLUS_MINUS);
break;
case '\"':
int startIndex = input.index();
Position startingPosition = input.position(startIndex);
boolean foundClosingQuote = false;
while (input.hasNext()) {
c = input.next();
if (c == '\\' && input.isNext('"')) {
c = input.next(); // consume the ' character since it is escaped
} else if (c == '"') {
foundClosingQuote = true;
break;
}
}
if (!foundClosingQuote) {
String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(),
startingPosition.getColumn());
throw new ParsingException(startingPosition, msg);
}
int endIndex = input.index() + 1; // beyond last character read
tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
break;
case '\'':
startIndex = input.index();
startingPosition = input.position(startIndex);
foundClosingQuote = false;
while (input.hasNext()) {
c = input.next();
if (c == '\\' && input.isNext('\'')) {
c = input.next(); // consume the ' character since it is escaped
} else if (c == '\'') {
foundClosingQuote = true;
break;
}
}
if (!foundClosingQuote) {
String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(),
startingPosition.getColumn());
throw new ParsingException(startingPosition, msg);
}
endIndex = input.index() + 1; // beyond last character read
tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
break;
default:
startIndex = input.index();
startingPosition = input.position(startIndex);
// Read until another whitespace is found
while (input.hasNext() && !(input.isNextWhitespace())) {
c = input.next();
}
endIndex = input.index() + 1; // beyond last character that was included
tokens.addToken(startingPosition, startIndex, endIndex, WORD);
}
}
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy