All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sap.cds.adapter.odata.v2.search.SearchTokenizer Maven / Gradle / Ivy

There is a newer version: 3.6.0
Show newest version
/**************************************************************************
 * (C) 2019-2024 SAP SE or an SAP affiliate company. All rights reserved. *
 **************************************************************************/
package com.sap.cds.adapter.odata.v2.search;

import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.sap.cds.services.utils.CdsErrorStatuses;
import com.sap.cds.services.utils.ErrorStatusException;

/**
 * 
 * searchExpr    = ( OPEN BWS searchExpr BWS CLOSE / searchTerm )
 *                 [ searchOrExpr / searchAndExpr ]
 * searchOrExpr  = RWS 'OR' RWS searchExpr
 * searchAndExpr = RWS [ 'AND' RWS ] searchExpr
 * searchTerm    = [ 'NOT' RWS ] ( searchPhrase / searchWord )
 * searchPhrase  = quotation-mark 1*qchar-no-AMP-DQUOTE quotation-mark
 * searchWord    = 1*ALPHA ; Actually: any character from the Unicode categories L or Nl,
 *                                     but not the words AND, OR, and NOT
 * 
* * ATTENTION: This class does not support a percent-encoded * searchPhrase because the URI parser percent decodes * each query before calling parsers of query options. */ public class SearchTokenizer { private final static Logger logger = LoggerFactory.getLogger(SearchTokenizer.class); private static abstract class State implements SearchQueryToken { private Token token = null; private boolean finished = false; protected static final char QUOTATION_MARK = '\"'; protected static final char PHRASE_ESCAPE_CHAR = '\\'; protected static final char CHAR_N = 'N'; protected static final char CHAR_O = 'O'; protected static final char CHAR_T = 'T'; protected static final char CHAR_A = 'A'; protected static final char CHAR_D = 'D'; protected static final char CHAR_R = 'R'; protected static final char CHAR_CLOSE = ')'; protected static final char CHAR_OPEN = '('; public State() { } public State(final Token t) { token = t; } public State(final Token t, final boolean finished) { this(t); this.finished = finished; } protected abstract State nextChar(char c); /** @param c allowed character */ public State allowed(final char c) { return this; } public State forbidden(final char c) { logger.error("Forbidden character in state " + token + "->" + c); throw new ErrorStatusException(CdsErrorStatuses.SEARCH_PARSING_FAILED); } public State invalid() { logger.error("Token " + token + " is in invalid state."); throw new ErrorStatusException(CdsErrorStatuses.SEARCH_PARSING_FAILED); } public State finish() { finished = true; return this; } public State finishAs(final Token token) { finished = true; return changeToken(token); } public boolean isFinished() { return finished; } @Override public Token getToken() { return token; } public State close() { return this; } protected State changeToken(final Token token) { this.token = token; return this; } static boolean isAllowedWord(final char character) { return Character.isUnicodeIdentifierStart(character) || isUnreserved(character) || isOtherDelimsForWord(character); } /** * * searchPhrase = quotation-mark 1*qchar-no-AMP-DQUOTE quotation-mark *

* qchar-no-AMP-DQUOTE = qchar-unescaped / escape ( escape / quotation-mark ) *

* qchar-unescaped = unreserved / pct-encoded-unescaped / other-delims / * ":" / "@" / "/" / "?" / "$" / "'" / "=" *

* unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" *

* escape = "\" / "%5C" ; reverse solidus U+005C *

* pct-encoded-unescaped = "%" ( "0" / "1" / "3" / "4" / "6" / "7" / "8" / "9" / A-to-F ) HEXDIG * / "%" "2" ( "0" / "1" / "3" / "4" / "5" / "6" / "7" / "8" / "9" / A-to-F ) * / "%" "5" ( DIGIT / "A" / "B" / "D" / "E" / "F" ) *

* other-delims = "!" / "(" / ")" / "*" / "+" / "," / ";" *

* quotation-mark = DQUOTE / "%22" *

* ALPHA = %x41-5A / %x61-7A *
* DIGIT = %x30-39 *
* DQUOTE = %x22 *
* * Checks if given character is allowed for a search phrase. * ATTENTION: Escaping and percent encoding is not be validated here (and * can not be validated on a single character).
* Hence for the {@link #PHRASE_ESCAPE_CHAR} and the {@link #QUOTATION_MARK} * characters this method will return FALSE.
* Furthermore percent encoded characters are also not validated (and can * not be validated on a single character).
* Hence for the % character this method assumeS that it was * percent encoded and is now decoded and will return TRUE.
* * @param character which is checked * @return true if character is allowed for a phrase */ static boolean isAllowedPhrase(final char character) { return character != '"'; } /** * other-delims = "!" / "(" / ")" / "*" / "+" / "," / ";" * * @param character which is checked * @return true if character is allowed */ private static boolean isOtherDelimsForWord(final char character) { return character == '!' || character == '*' || character == '+' || character == ':' || character == '@' || character == '/' || character == '\\' || character == '?' || character == '$' || character == '=' || character == '%' || character == '\'' || character == '&' || character == '{' || character == '}' || character == '[' || character == ']' || character == ',' || character == '#' || character == '^' || character == '|' || character == '>' || character == '<' || character == '`' || character == ';'; } /** * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" * * @param character which is checked * @return true if character is allowed */ private static boolean isUnreserved(final char character) { return isAlphaOrDigit(character) || character == '-' || character == '.' || character == '_' || character == '~'; } /** * ALPHA = %x41-5A / %x61-7A DIGIT = %x30-39 * * @param character which is checked * @return true if character is allowed */ private static boolean isAlphaOrDigit(final char character) { return ('A' <= character && character <= 'Z') // case A..Z || ('a' <= character && character <= 'z') // case a..z || ('0' <= character && character <= '9'); // case 0..9 } // BWS = *( SP / HTAB / "%20" / "%09" ) ; "bad" whitespace // RWS = 1*( SP / HTAB / "%20" / "%09" ) ; "required" whitespace static boolean isWhitespace(final char character) { return character == ' ' || character == '\t'; } @Override public String getLiteral() { return token.toString(); } @Override public String toString() { return token + "=>{" + getLiteral() + "}"; } } private static abstract class LiteralState extends State { protected final StringBuilder literal = new StringBuilder(); public LiteralState() { super(); } public LiteralState(final Token t, final char c) { super(t); init(c); } public LiteralState(final Token t, final String initLiteral) { super(t); literal.append(initLiteral); } @Override public State allowed(final char c) { literal.append(c); return this; } @Override public String getLiteral() { return literal.toString(); } public State init(final char c) { if (isFinished()) { logger.error(toString() + " is already finished."); throw new ErrorStatusException(CdsErrorStatuses.SEARCH_PARSING_FAILED); } literal.append(c); return this; } } private class SearchExpressionState extends LiteralState { @Override public State nextChar(final char c) { if (c == CHAR_OPEN) { return new OpenState(); } else if (isWhitespace(c)) { return new RwsState(); } else if (c == CHAR_CLOSE) { return new CloseState(); } else { return new SearchTermState().init(c); } } @Override public State init(final char c) { return nextChar(c); } } private class SearchTermState extends LiteralState { @Override public State nextChar(final char c) { if (c == CHAR_N) { return new NotState(c); } else if (c == QUOTATION_MARK) { return new SearchPhraseState(c); } else if (isAllowedWord(c)) { return new SearchWordState(c); } return forbidden(c); } @Override public State init(final char c) { return nextChar(c); } } /** * * As per the updated abnf * https://github.com/oasis-tcs/odata-abnf/blob/master/abnf/odata-abnf-construction-rules.txt#L332-L356. * searchWord = 1*( ALPHA / DIGIT / COMMA / "." / "-" / pct-encoded ) This * includes Unicode characters of categories L or N using UTF-8 and * percent-encoding. */ private class SearchWordState extends LiteralState { public SearchWordState(final char c) { super(Token.WORD, c); if (!isAllowedWord(c)) { forbidden(c); } } public SearchWordState(final State toConsume) { super(Token.WORD, toConsume.getLiteral()); for (int i = 0; i < literal.length(); i++) { if (!isAllowedWord(literal.charAt(i))) { forbidden(literal.charAt(i)); } } } @Override public State nextChar(final char c) { if (isAllowedWord(c)) { return allowed(c); } else if (c == CHAR_CLOSE) { finish(); return new CloseState(); } else if (isWhitespace(c)) { finish(); return new RwsState(); } return forbidden(c); } @Override public State finish() { String tmpLiteral = literal.toString(); if (tmpLiteral.length() == 3) { if (Token.AND.name().equals(tmpLiteral)) { return finishAs(Token.AND); } else if (Token.NOT.name().equals(tmpLiteral)) { return finishAs(Token.NOT); } } else if (tmpLiteral.length() == 2 && Token.OR.name().equals(tmpLiteral)) { return finishAs(Token.OR); } return super.finish(); } @Override public State close() { return finish(); } } private class SearchPhraseState extends LiteralState { private boolean closed = false; private boolean escaped = false; public SearchPhraseState(final char c) { super(Token.PHRASE, c); if (c != QUOTATION_MARK) { forbidden(c); } } @Override public State nextChar(final char c) { if (closed) { finish(); if (c == CHAR_CLOSE) { return new CloseState(); } else if (isWhitespace(c)) { return new RwsState(); } } else if (escaped) { escaped = false; if (c == QUOTATION_MARK || c == PHRASE_ESCAPE_CHAR) { return allowed(c); } else { return forbidden(c); } } else if (c == PHRASE_ESCAPE_CHAR) { escaped = true; return this; } else if (isAllowedPhrase(c)) { return allowed(c); } else if (isWhitespace(c)) { return allowed(c); } else if (c == QUOTATION_MARK) { if (literal.length() == 1) { return invalid(); } closed = true; return allowed(c); } return forbidden(c); } @Override public State close() { if (closed) { return finish(); } return invalid(); } } private class OpenState extends State { public OpenState() { super(Token.OPEN, true); } @Override public State nextChar(final char c) { finish(); if (isWhitespace(c)) { return forbidden(c); } return new SearchExpressionState().init(c); } } private class CloseState extends State { public CloseState() { super(Token.CLOSE, true); } @Override public State nextChar(final char c) { return new SearchExpressionState().init(c); } } private class NotState extends LiteralState { public NotState(final char c) { super(Token.NOT, c); if (c != CHAR_N) { forbidden(c); } } @Override public State nextChar(final char c) { if (literal.length() == 1 && c == CHAR_O) { return allowed(c); } else if (literal.length() == 2 && c == CHAR_T) { return allowed(c); } else if (literal.length() == 3 && isWhitespace(c)) { finish(); return new BeforePhraseOrWordRwsState(); } else if (isWhitespace(c)) { changeToken(Token.WORD).finish(); return new RwsState(); } literal.append(c); return new SearchWordState(this); } @Override public State close() { if (Token.NOT.name().contentEquals(literal)) { return finish(); } return changeToken(Token.WORD).finish(); } } private class AndState extends LiteralState { public AndState(final char c) { super(Token.AND, c); if (c != CHAR_A) { forbidden(c); } } @Override public State nextChar(final char c) { if (literal.length() == 1 && c == CHAR_N) { return allowed(c); } else if (literal.length() == 2 && c == CHAR_D) { return allowed(c); } else if (literal.length() == 3 && isWhitespace(c)) { finish(); return new BeforeSearchExpressionRwsState(); } else if (isWhitespace(c)) { changeToken(Token.WORD).finish(); return new RwsState(); } literal.append(c); return new SearchWordState(this); } @Override public State close() { if (Token.AND.name().contentEquals(literal)) { return finish(); } return changeToken(Token.WORD).finish(); } } private class OrState extends LiteralState { public OrState(final char c) { super(Token.OR, c); if (c != CHAR_O) { forbidden(c); } } @Override public State nextChar(final char c) { if (literal.length() == 1 && (c == CHAR_R)) { return allowed(c); } else if (literal.length() == 2 && isWhitespace(c)) { finish(); return new BeforeSearchExpressionRwsState(); } else if (isWhitespace(c)) { changeToken(Token.WORD).finish(); return new RwsState(); } literal.append(c); return new SearchWordState(this); } @Override public State close() { if (Token.OR.name().contentEquals(literal)) { return finish(); } return changeToken(Token.WORD).finish(); } } // RWS 'OR' RWS searchExpr // RWS [ 'AND' RWS ] searchExpr private class BeforeSearchExpressionRwsState extends State { @Override public State nextChar(final char c) { if (isWhitespace(c)) { return allowed(c); } else { return new SearchExpressionState().init(c); } } } private class BeforePhraseOrWordRwsState extends State { @Override public State nextChar(final char c) { if (isWhitespace(c)) { return allowed(c); } else if (c == QUOTATION_MARK) { return new SearchPhraseState(c); } else { return new SearchWordState(c); } } } private class RwsState extends State { @Override public State nextChar(final char c) { if (isWhitespace(c)) { return allowed(c); } else if (c == CHAR_O) { return new OrState(c); } else if (c == CHAR_A) { return new AndState(c); } else { return new SearchExpressionState().init(c); } } } /** * Takes the search query and splits it into a list of corresponding * {@link SearchQueryToken}s. Before splitting it into tokens, leading and * trailing whitespace in the given search query string is removed. * * @param searchQuery search query to be tokenized * @return list of tokens */ public List tokenize(final String searchQuery) { if (searchQuery.contains("%28") || searchQuery.contains("%29") || searchQuery.contains("%22")) { logger.error("Invalid Token in Query string '"); throw new ErrorStatusException(CdsErrorStatuses.SEARCH_PARSING_FAILED); } char[] chars = searchQuery.trim().toCharArray(); State state = new SearchExpressionState(); List states = new ArrayList<>(); for (char aChar : chars) { State next = state.nextChar(aChar); if (state.isFinished()) { states.add(state); } state = next; } if (state.close().isFinished()) { states.add(state); } else { logger.error("Last parsed state '" + state.toString() + "' is not finished."); throw new ErrorStatusException(CdsErrorStatuses.SEARCH_PARSING_FAILED); } return states; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy