All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencastproject.adminui.util.QueryPreprocessor Maven / Gradle / Ivy

There is a newer version: 16.6
Show newest version
/*
 * Licensed to The Apereo Foundation under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 *
 * The Apereo Foundation licenses this file to you under the Educational
 * Community License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License
 * at:
 *
 *   http://opensource.org/licenses/ecl2.txt
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 */

package org.opencastproject.adminui.util;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/**
 * Utility class to preprocess potentially malformed Lucene query strings.
 *
 * The following sanitizations are performed:
 *
 * - Escape special characters that would potentially lead to malformed queries
 * - Enable partial search by adding '*' as both prefix and suffix to individual terms
 *   - Exception: Double-quoted terms
 * - Sanitize use of double quotes by appending a double quote at the end of the sanitized query in case
 *   the closing double quote is missing
 * - Ensure that + and - are interpreted in a user-friendly way, i.e. test-unit is not interpreted as test -unit
 */
public final class QueryPreprocessor {

  private static final Logger logger = LoggerFactory.getLogger(QueryPreprocessor.class);

  private static final char DOUBLE_QUOTE = '"';
  private static final char MINUS = '-';
  private static final char PLUS = '+';
  private static final char ASTERISK = '*';
  private static final char EXCLAMATION_MARK = '!';
  private static final char BACKSLASH = '\\';
  private static final char AMPERSAND = '&';
  private static final char PIPE = '|';

  /** Characters that get a backslash prepended when they occur inside an unquoted token. */
  private static final Set<Character> ESCAPED_CHARACTERS = new HashSet<Character>(Arrays.asList(
    MINUS,
    PLUS,
    EXCLAMATION_MARK,
    BACKSLASH,
    AMPERSAND,
    PIPE,
    '(', ')', '{', '}', '[', ']', ':', '^', '~'
  ));

  /** Characters treated as unary operators when they are the first character of a token. */
  private static final Set<Character> UNARY_OPERATORS = new HashSet<Character>(Arrays.asList(
    MINUS,
    PLUS,
    EXCLAMATION_MARK
  ));

  /** Tokens treated as binary operators when they are delimited by whitespace. */
  private static final Set<String> BINARY_OPERATORS = new HashSet<String>(Arrays.asList("&&", "||"));

  private QueryPreprocessor() {
    // Utility class, not meant to be instantiated
  }

  /**
   * Sanitize a potentially malformed query string so it conforms to the Lucene query syntax.
   *
   * The query is split into whitespace separated tokens (double-quoted phrases are kept as one token), each
   * token is sanitized on its own and the sanitized tokens are joined by a single space. For example,
   * {@code test-unit} becomes {@code *test\-unit*} and {@code "Hello World} becomes {@code "Hello World"}.
   *
   * @param query
   *          potentially malformed Lucene query string, must not be {@code null}
   * @return
   *        sanitized query string
   * @throws NullPointerException
   *           if {@code query} is {@code null}
   */
  public static String sanitize(String query) {
    ArrayList<String> tokens = tokenize(query);
    StringBuilder sanitizedQuery = new StringBuilder();

    for (int i = 0; i < tokens.size(); i++) {
      String token = tokens.get(i);
      String sanitizedToken;

      if (isUnaryOperator(token)) {
        sanitizedToken = sanitizeUnaryOperator(token);
      } else if (isBinaryOperator(token)) {
        // A binary operator needs a non-operator token on both sides
        boolean leftOperandMissing = (i == 0) || isBinaryOperator(tokens.get(i - 1));
        boolean rightOperandMissing = (i >= tokens.size() - 1) || isBinaryOperator(tokens.get(i + 1));
        if (leftOperandMissing || rightOperandMissing) {
          // Escape operator since operands missing
          sanitizedToken = "" + BACKSLASH + token;
        } else {
          sanitizedToken = token;
        }
      } else {
        sanitizedToken = enablePartialMatches(token, 0);
      }

      if (i != 0) {
        sanitizedQuery.append(' ');
      }
      sanitizedQuery.append(sanitizedToken);
    }

    String result = sanitizedQuery.toString();
    logger.debug("Sanitized input '{}' to '{}'", query, result);
    return result;
  }

  /**
   * Check whether a token starts with a unary operator (+, - or !).
   *
   * @param token
   *          token to be checked
   * @return
   *        true if the token is not empty and its first character is a unary operator
   */
  private static boolean isUnaryOperator(String token) {
    return (token.length() > 0) && isUnaryOperator(token.charAt(0));
  }

  /**
   * Check whether a single character is a unary operator (+, - or !).
   *
   * @param ch
   *          character to be checked
   * @return
   *        true if the character is a unary operator
   */
  private static boolean isUnaryOperator(char ch) {
    return UNARY_OPERATORS.contains(ch);
  }

  /**
   * Check whether a token is a binary operator (&amp;&amp; or ||).
   *
   * @param token
   *          token to be checked
   * @return
   *        true if the token is exactly one of the binary operators
   */
  private static boolean isBinaryOperator(String token) {
    return BINARY_OPERATORS.contains(token);
  }

  /**
   * Helper method to enable partial matching for string literals or operands.
   *
   * An asterisk is inserted at position begin unless the character found there is a double quote or an asterisk,
   * and an asterisk is appended unless the last character is a double quote or an asterisk.
   *
   * @param string
   *          token to be sanitized, must contain a character at position begin
   * @param begin
   *          first character of operand, 0 for string literals
   * @return
   *        the token with wildcards added as described above
   */
  private static String enablePartialMatches(String string, int begin) {
    StringBuilder result = new StringBuilder(string);

    char first = string.charAt(begin);
    if ((first != DOUBLE_QUOTE) && (first != ASTERISK)) {
      // Keeps a leading unary operator (everything before begin) in front of the wildcard
      result.insert(begin, ASTERISK);
    }

    char last = result.charAt(result.length() - 1);
    if ((last != DOUBLE_QUOTE) && (last != ASTERISK)) {
      result.append(ASTERISK);
    }
    return result.toString();
  }

  /**
   * Helper method to sanitize unary operator tokens.
   * This method performs the following sanitizations:
   * - Escape unary operator in case of missing argument (ensure syntactical correctness)
   * - Enable partial matching for the operand
   *
   * @param token
   *          token to be sanitized, its first character is expected to be a unary operator
   * @return
   *        the escaped operator if the token consists of the operator only, otherwise the token with partial
   *        matching enabled for the operand following the operator
   */
  private static String sanitizeUnaryOperator(String token) {
    if (token.length() == 1) {
      // Escape unary operator because of missing operand
      return "" + BACKSLASH + token;
    }
    return enablePartialMatches(token, 1);
  }

  /**
   * Helper method to (pseudo)-tokenize a character sequence.
   *
   * Tokens are delimited by whitespace. A double-quoted phrase forms a single token including its quotes; a
   * missing closing double quote is appended. Outside of quoted phrases, special characters are escaped with a
   * backslash unless they are a unary operator at the beginning of a token or a whitespace delimited binary
   * operator.
   *
   * @param query
   *          string to be tokenized
   * @return
   *        list of tokens
   */
  private static ArrayList<String> tokenize(String query) {

    ArrayList<String> tokens = new ArrayList<String>();
    StringBuilder currentToken = new StringBuilder();

    boolean openDoubleQuote = false;
    int i = 0;

    while (i < query.length()) {

      char ch = query.charAt(i);

      if (ch == DOUBLE_QUOTE) {
        if (openDoubleQuote) {
          // Closing double quote terminates the token
          currentToken.append(DOUBLE_QUOTE);
          tokens.add(currentToken.toString());
          currentToken.setLength(0);
          openDoubleQuote = false;
        } else if (currentToken.length() == 0
                   || (isUnaryOperator(charAt(i - 1, query)) && Character.isWhitespace(charAt(i - 2, query)))) {
          // Opening double quote at the beginning of a token or directly after a leading unary operator
          currentToken.append(DOUBLE_QUOTE);
          openDoubleQuote = true;
        } else {
          // Escape double quote character to enforce whitespace separated tokens
          currentToken.append(BACKSLASH).append(DOUBLE_QUOTE);
        }
      } else if (openDoubleQuote) {
        // No special handling of characters within quoted strings
        currentToken.append(ch);
      } else if (isUnaryOperator(ch) && Character.isWhitespace(charAt(i - 1, query))) {
        // We only allow unary operators as first character of a token
        currentToken.append(ch);
      } else if (isBinaryOperator("" + ch + charAt(i + 1, query))
                 && Character.isWhitespace(charAt(i - 1, query))
                 && Character.isWhitespace(charAt(i + 2, query))) {
        // Binary operator detected, i.e. whitespace delimited && or ||
        tokens.add("" + ch + ch);
        i++; // Skip the second character of the operator, i.e. two characters are consumed in this round
      } else if (Character.isWhitespace(ch)) {
        // Whitespace delimits tokens
        if (currentToken.length() > 0) {
          tokens.add(currentToken.toString());
          currentToken.setLength(0);
        }
      } else {
        if (ESCAPED_CHARACTERS.contains(ch)) {
          currentToken.append(BACKSLASH);
        }
        currentToken.append(ch);
      }
      i++;
    }

    if (currentToken.length() > 0) {
      if (openDoubleQuote) {
        // Syntax error detected (missing closing double quote). We fix this.
        currentToken.append(DOUBLE_QUOTE);
      }
      tokens.add(currentToken.toString());
    }

    return tokens;
  }

  /**
   * Helper method to look up characters in strings without resulting in IndexOutOfBound exceptions
   *
   * @param position
   *          position within the string to get the character from
   * @param string
   *          the string we want to look up a character in
   * @return
   *        the character found at specified position or ' ' if position not within string
   */
  private static char charAt(int position, String string) {
    if ((0 <= position) && (position < string.length())) {
      return string.charAt(position);
    }
    return ' ';
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy