// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it how often you want.
/*
* Licensed to The Apereo Foundation under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
*
* The Apereo Foundation licenses this file to you under the Educational
* Community License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License
* at:
*
* http://opensource.org/licenses/ecl2.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
*/
package org.opencastproject.adminui.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Utility class to preprocess potentially malformed Lucene query strings.
*
* The following sanitizations are performed:
*
* - Escape special characters that would potentially lead to malformed queries
* - Enable partial search by adding '*' as both prefix and suffix to individual terms
* - Exception: Double-quoted terms
* - Sanitize use of double quotes by appending a double quote at the end of the sanitized query in case
* the closing double quote is missing
* - Ensure that + and - are interpreted in a user-friendly way, i.e. test-unit is not interpreted as test -unit
*/
public final class QueryPreprocessor {

  private static final Logger logger = LoggerFactory.getLogger(QueryPreprocessor.class);

  private static final char DOUBLE_QUOTE = '"';
  private static final char MINUS = '-';
  private static final char PLUS = '+';
  private static final char ASTERISK = '*';
  private static final char EXCLAMATION_MARK = '!';
  private static final char BACKSLASH = '\\';
  private static final char AMPERSAND = '&';
  private static final char PIPE = '|';

  /**
   * Characters that are prefixed with a backslash when they occur in unquoted text.
   * '*' and '?' are intentionally absent from this set, so they pass through unescaped.
   * NOTE(review): '/' is not escaped either - confirm whether the targeted Lucene version treats it as special.
   */
  private static final Set<Character> ESCAPED_CHARACTERS = new HashSet<Character>(Arrays.asList(
      MINUS,
      PLUS,
      EXCLAMATION_MARK,
      BACKSLASH,
      AMPERSAND,
      PIPE,
      '(', ')', '{', '}', '[', ']', ':', '^', '~'
  ));

  /** Characters accepted as unary operators when they are the first character of a token. */
  private static final Set<Character> UNARY_OPERATORS = new HashSet<Character>(Arrays.asList(
      MINUS,
      PLUS,
      EXCLAMATION_MARK
  ));

  /** Two-character binary operators, recognized only when delimited by whitespace on both sides. */
  private static final Set<String> BINARY_OPERATORS = new HashSet<String>(Arrays.asList("&&", "||"));

  /** Utility class, not meant to be instantiated. */
  private QueryPreprocessor() {
  }

  /**
   * Sanitize a potentially malformed query string so it conforms to the Lucene query syntax.
   *
   * The query is split into tokens, each token is sanitized on its own, and the sanitized tokens are joined
   * with a single space.
   *
   * @param query
   *          potentially malformed Lucene query string; {@code null} is treated like an empty query
   * @return
   *          sanitized query string, empty if the query contains no tokens
   */
  public static String sanitize(String query) {
    if (query == null) {
      return "";
    }

    List<String> tokens = tokenize(query);
    StringBuilder joined = new StringBuilder();

    for (int i = 0; i < tokens.size(); i++) {
      String token = tokens.get(i);
      String sanitizedToken;

      if (isUnaryOperator(token)) {
        sanitizedToken = sanitizeUnaryOperator(token);
      } else if (isBinaryOperator(token)) {
        // A binary operator needs a non-operator token on both sides
        boolean missingLeftOperand = (i == 0) || isBinaryOperator(tokens.get(i - 1));
        boolean missingRightOperand = (i >= tokens.size() - 1) || isBinaryOperator(tokens.get(i + 1));
        if (missingLeftOperand || missingRightOperand) {
          // Escape operator since operands missing
          sanitizedToken = BACKSLASH + token;
        } else {
          sanitizedToken = token;
        }
      } else {
        sanitizedToken = enablePartialMatches(token, 0);
      }

      if (i != 0) {
        joined.append(' ');
      }
      joined.append(sanitizedToken);
    }

    String sanitizedQuery = joined.toString();
    logger.debug("Sanitized input '{}' to '{}'", query, sanitizedQuery);
    return sanitizedQuery;
  }

  /**
   * Check whether a token starts with a unary operator character.
   *
   * @param token
   *          token to be inspected
   * @return
   *          true if the token is not empty and its first character is +, - or !
   */
  private static boolean isUnaryOperator(String token) {
    return !token.isEmpty() && isUnaryOperator(token.charAt(0));
  }

  /**
   * Check whether a single character is a unary operator character.
   *
   * @param ch
   *          character to be inspected
   * @return
   *          true if the character is +, - or !
   */
  private static boolean isUnaryOperator(char ch) {
    return UNARY_OPERATORS.contains(ch);
  }

  /**
   * Check whether a token is exactly one of the binary operators.
   *
   * @param token
   *          token to be inspected
   * @return
   *          true if the token equals && or ||
   */
  private static boolean isBinaryOperator(String token) {
    return BINARY_OPERATORS.contains(token);
  }

  /**
   * Helper method to enable partial matching for string literals or operands.
   *
   * An asterisk is inserted at position {@code begin} unless the character found there is a double quote or an
   * asterisk, and an asterisk is appended unless the last character is a double quote or an asterisk.
   *
   * NOTE(review): only the last character is inspected, so a token ending in an escaped double quote
   * (backslash followed by a double quote) does not get a trailing asterisk either.
   *
   * @param string
   *          token to be sanitized, must contain a character at position {@code begin}
   * @param begin
   *          position of the first character of the operand, 0 for string literals
   * @return
   *          the token with wildcards added where applicable
   */
  private static String enablePartialMatches(String string, int begin) {
    StringBuilder result = new StringBuilder(string);

    char first = string.charAt(begin);
    if ((first != DOUBLE_QUOTE) && (first != ASTERISK)) {
      result.insert(begin, ASTERISK);
    }

    char last = result.charAt(result.length() - 1);
    if ((last != DOUBLE_QUOTE) && (last != ASTERISK)) {
      result.append(ASTERISK);
    }

    return result.toString();
  }

  /**
   * Helper method to sanitize unary operator tokens.
   * This method performs the following sanitizations:
   * - Escape unary operator in case of missing operand (ensure syntactical correctness)
   * - Enable partial matching for the operand
   *
   * @param token
   *          token to be sanitized, its first character is a unary operator
   * @return
   *          the escaped operator if the token consists of the operator only, otherwise the token with partial
   *          matching enabled for the operand following the operator
   */
  private static String sanitizeUnaryOperator(String token) {
    if (token.length() == 1) {
      // Escape unary operator because of missing operand
      return String.valueOf(BACKSLASH) + token.charAt(0);
    }
    return enablePartialMatches(token, 1);
  }

  /**
   * Helper method to (pseudo)-tokenize a character sequence.
   *
   * Tokens are delimited by whitespace, except within double quotes where all characters are taken over
   * unchanged. Characters with a special meaning are escaped with a backslash unless they are used as a unary
   * operator at the start of a token or as a whitespace-delimited binary operator. A missing closing double
   * quote is appended to the last token.
   *
   * @param query
   *          string to be tokenized
   * @return
   *          list of tokens
   */
  private static List<String> tokenize(String query) {
    List<String> tokens = new ArrayList<String>();
    StringBuilder currentToken = new StringBuilder();
    boolean openDoubleQuote = false;

    int i = 0;
    while (i < query.length()) {
      char ch = query.charAt(i);

      if (ch == DOUBLE_QUOTE) {
        if (openDoubleQuote) {
          // Closing double quote terminates the current token
          currentToken.append(DOUBLE_QUOTE);
          tokens.add(currentToken.toString());
          currentToken.setLength(0);
          openDoubleQuote = false;
        } else if (currentToken.length() == 0
            || (isUnaryOperator(charAt(i - 1, query)) && Character.isWhitespace(charAt(i - 2, query)))) {
          // Opening double quote at the start of a token or directly after a leading unary operator
          currentToken.append(DOUBLE_QUOTE);
          openDoubleQuote = true;
        } else {
          // Escape double quote character to enforce whitespace separated tokens
          currentToken.append(BACKSLASH).append(DOUBLE_QUOTE);
        }
      } else if (openDoubleQuote) {
        // No special handling of characters within quoted strings
        currentToken.append(ch);
      } else if (isUnaryOperator(ch) && Character.isWhitespace(charAt(i - 1, query))) {
        // We only allow unary operators as first character of a token
        currentToken.append(ch);
      } else if (isBinaryOperator("" + ch + charAt(i + 1, query))
          && Character.isWhitespace(charAt(i - 1, query))
          && Character.isWhitespace(charAt(i + 2, query))) {
        // Binary operator detected, i.e. whitespace delimited && or ||.
        // Both characters of the operator are identical, hence the current one is simply doubled.
        tokens.add("" + ch + ch);
        i++; // Skip the second character of the operator, i.e. two characters are consumed in this round
      } else if (Character.isWhitespace(ch)) {
        // Whitespace delimits tokens
        if (currentToken.length() > 0) {
          tokens.add(currentToken.toString());
          currentToken.setLength(0);
        }
      } else {
        if (ESCAPED_CHARACTERS.contains(ch)) {
          currentToken.append(BACKSLASH);
        }
        currentToken.append(ch);
      }
      i++;
    }

    if (currentToken.length() > 0) {
      if (openDoubleQuote) {
        // Syntax error detected (missing closing double quote). We fix this.
        currentToken.append(DOUBLE_QUOTE);
      }
      tokens.add(currentToken.toString());
    }
    return tokens;
  }

  /**
   * Helper method to look up characters in strings without resulting in IndexOutOfBound exceptions.
   *
   * @param position
   *          position within the string to read the character from
   * @param string
   *          the string to look up the character in
   * @return
   *          the character found at the specified position or ' ' if the position is not within the string
   */
  private static char charAt(int position, String string) {
    if ((0 <= position) && (position < string.length())) {
      return string.charAt(position);
    }
    return ' ';
  }
}