org.modeshape.jcr.sequencer.PathExpression Maven / Gradle / Ivy
/* * ModeShape (http://www.modeshape.org) * See the COPYRIGHT.txt file distributed with this work for information * regarding copyright ownership. Some portions may be licensed * to Red Hat, Inc. under one or more contributor license agreements. * See the AUTHORS.txt file in the distribution for a full listing of * individual contributors. * * ModeShape is free software. Unless otherwise indicated, all code in ModeShape * is licensed to you under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * ModeShape is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this software; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA, or see the FSF site: http://www.fsf.org. */ package org.modeshape.jcr.sequencer; import java.io.Serializable; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import org.modeshape.common.annotation.Immutable; import org.modeshape.common.util.CheckArg; import org.modeshape.common.util.HashCode; import org.modeshape.common.util.ObjectUtil; import org.modeshape.jcr.GraphI18n; /** * An expression that defines an acceptable path using a regular-expression-like language. Path expressions can be used to * represent node paths or properties. *
"), any sequence of nodes ("* Let's first look at some simple examples of path expressions: *
**
** *Path expression *Description ** */a/b *Match node " *b
" that is a child of the top level node "a
". Neither node may have any * same-name-siblings.* */a/* *Match any child node of the top level node " *a
".* */a/*.txt *Match any child node of the top level node " *a
" that also has a name ending in ".txt
".* */a/b@c *Match the property " *c
" of node "/a/b
".* */a/b[2] *The second child named " *b
" below the top level node "a
".* */a/b[2,3,4] *The second, third or fourth child named " *b
" below the top level node "a
".* */a/b[*] *Any (and every) child named " *b
" below the top level node "a
".* *//a/b *Any node named " *b
" that exists below a node named "a
", regardless of where node "a
" * occurs. Again, neither node may have any same-name-siblings.* With these simple examples, you can probably discern the most important rules. First, the '
**
' is a wildcard * character that matches any character or sequence of characters in a node's name (or index if appearing in between square * brackets), and can be used in conjunction with other characters (e.g., "*.txt
"). ** Second, square brackets (i.e., '
*[
' and ']
') are used to match a node's same-name-sibling index. You * can put a single non-negative number or a comma-separated list of non-negative numbers. Use '0' to match a node that has no * same-name-siblings, or any positive number to match the specific same-name-sibling. ** Third, combining two delimiters (e.g., "
*//
") matches any sequence of nodes, regardless of what their names are or * how many nodes there are. This is often used with other patterns to identify nodes at any level matching other patterns. Three or more sequential * slash characters are treated as two. ** Many path expressions can be created using just these simple rules. However, input paths can be more complicated. Here are some * more examples: *
**
** *Path expressions *Description ** */a/(b|c|d) *Match children of the top level node " *a
" that are named "b
", "c
" or "d
* ". None of the nodes may have same-name-sibling indexes.* */a/b[c/d] *Match node " *b
" child of the top level node "a
", when node "b
" has a child named " *c
", and "c
" has a child named "d
". Node "b
* " is the selected node, while nodes "c
" and "d
" are used as criteria but are not selected.* */a(/(b|c|d|)/e)[f/g/@something] *Match node " */a/b/e
", "/a/c/e
", "/a/d/e
", or "/a/e
* " when they also have a child "f
" that itself has a child "g
" with property "something
". * None of the nodes may have same-name-sibling indexes.* These examples show a few more advanced rules. Parentheses (i.e., '
*(
' and ')
') can be used to define * a set of options for names, as shown in the first and third rules. Whatever part of the selected node's path appears between * the parentheses is captured for use within the output path. Thus, the first input path in the previous table would match node " */a/b
", and "b" would be captured and could be used within the output path using "$1
", where the * number used in the output path identifies the parentheses. ** Square brackets can also be used to specify criteria on a node's properties or children. Whatever appears in between the square * brackets does not appear in the selected node. *
*Workspace names
** Path expressions can also specify restrictions on the workspace name to constrain the path expression to matching only paths * from certain workspaces meeting the name criteria. Of course, if the path expression doesn't include these restrictions, the * workspace name is not considered when matching paths. *
*/ @Immutable public class PathExpression implements Serializable { /** * Initial version */ private static final long serialVersionUID = 1L; /** * Compile the supplied expression and return the resulting path expression instance. * * @param expression the expression * @return the path expression; never null * @throws IllegalArgumentException if the expression is null * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression */ public static final PathExpression compile( String expression ) throws InvalidPathExpressionException { return new PathExpression(expression); } private static final String SEQUENCE_PATTERN_STRING = "\\[(\\d+(?:,\\d+)*)\\]"; // \[(\d+(,\d+)*)\] private static final Pattern SEQUENCE_PATTERN = Pattern.compile(SEQUENCE_PATTERN_STRING); /** * Regular expression used to find unusable XPath predicates within an expression. This pattern results in unusable predicates * in group 1. Note that some predicates may be valid at the end but not valid elsewhere. ** Currently, only index-like predicates (including sequences) are allowed everywhere. Predicates with paths and properties * are allowed only as the last predicate. Predicates with any operators are unused. *
** Nested predicates are not currently allowed. *
*/ // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(?:\[[^\]\+\-\*=\!><'"\s]+\])$|(\[[^\]]+\]) private static final String UNUSABLE_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(?:\\[[^\\]\\+\\-\\*=\\!><'\"\\s]+\\])$|(\\[[^\\]]+\\])"; private static final Pattern UNUSABLE_PREDICATE_PATTERN = Pattern.compile(UNUSABLE_PREDICATE_PATTERN_STRING); /** * Regular expression used to find all XPath predicates except index and sequence patterns. This pattern results in the * predicates to be removed in group 1. */ // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(\[[^\]]+\]) private static final String NON_INDEX_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(\\[[^\\]]+\\])"; private static final Pattern NON_INDEX_PREDICATE_PATTERN = Pattern.compile(NON_INDEX_PREDICATE_PATTERN_STRING); /** * The regular expression that is used to extract the workspace name and path from an path expression (or a real path). The * regular expression is(([^:/]*):)?(.*)
. Group 2 will contain the workspace name and group 3 the path. */ private static final String WORKSPACE_AND_PATH_PATTERN_STRING = "(([^:/]*):)?(.*)"; private static final Pattern WORKSPACE_AND_PATH_PATTERN = Pattern.compile(WORKSPACE_AND_PATH_PATTERN_STRING); private final String expression; /** * This is the pattern that is used to determine if the particular path is from a particular workspace. This pattern will be * null if the expression does not constrain the workspace name. */ private final Pattern workspacePattern; /** * This is the pattern that is used to determine if there is a match with particular paths. */ private final Pattern matchPattern; /** * This is the pattern that is used to determine which parts of the particular input paths are included in the * {@link Matcher#getSelectedNodePath() selected path}, only after the input path has already matched. */ private final Pattern selectPattern; /** * Create the supplied expression. * * @param expression the expression * @throws IllegalArgumentException if the expression is null * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression */ public PathExpression( String expression ) throws InvalidPathExpressionException { CheckArg.isNotNull(expression, "path expression"); this.expression = expression.trim(); if (this.expression.length() == 0) { throw new InvalidPathExpressionException(GraphI18n.pathExpressionMayNotBeBlank.text()); } // Separate out the repository name, workspace name, and path fragments into separate match patterns ... WorkspacePath repoPath = parsePathInWorkspace(this.expression); if (repoPath == null) { throw new InvalidPathExpressionException(GraphI18n.pathExpressionHasInvalidMatch.text(this.expression, this.expression)); } String workPatternStr = repoPath.workspaceName != null ? repoPath.workspaceName : ".*"; String pathPatternStr = repoPath.path; this.workspacePattern = Pattern.compile(workPatternStr); // Build the repository match pattern ... 
// Build the match pattern, which determines whether a path matches the condition ... String matchString = pathPatternStr; try { matchString = removeUnusedPredicates(matchString); matchString = replaceXPathPatterns(matchString); this.matchPattern = Pattern.compile(matchString, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { String msg = GraphI18n.pathExpressionHasInvalidMatch.text(matchString, this.expression); throw new InvalidPathExpressionException(msg, e); } // Build the select pattern, which determines the path that will be selected ... String selectString = pathPatternStr; try { selectString = removeAllPredicatesExceptIndexes(selectString); selectString = replaceXPathPatterns(selectString); selectString = "(" + selectString + ").*"; // group 1 will have selected path ... this.selectPattern = Pattern.compile(selectString, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { String msg = GraphI18n.pathExpressionHasInvalidSelect.text(selectString, this.expression); throw new InvalidPathExpressionException(msg, e); } } /** * @return expression */ public String getExpression() { return expression; } /** * Replace certain XPath patterns that are not used or understood. * * @param expression the input regular expressions string; may not be null * @return the regular expression with all unused XPath patterns removed; never null */ protected String removeUnusedPredicates( String expression ) { assert expression != null; java.util.regex.Matcher matcher = UNUSABLE_PREDICATE_PATTERN.matcher(expression); StringBuffer sb = new StringBuffer(); if (matcher.find()) { do { // Remove those predicates that show up in group 1 ... 
String predicateStr = matcher.group(0); String unusablePredicateStr = matcher.group(1); if (unusablePredicateStr != null) { predicateStr = ""; } matcher.appendReplacement(sb, predicateStr); } while (matcher.find()); matcher.appendTail(sb); expression = sb.toString(); } return expression; } /** * Remove all XPath predicates from the supplied regular expression string. * * @param expression the input regular expressions string; may not be null * @return the regular expression with all XPath predicates removed; never null */ protected String removeAllPredicatesExceptIndexes( String expression ) { assert expression != null; java.util.regex.Matcher matcher = NON_INDEX_PREDICATE_PATTERN.matcher(expression); StringBuffer sb = new StringBuffer(); if (matcher.find()) { do { // Remove those predicates that show up in group 1 ... String predicateStr = matcher.group(0); String unusablePredicateStr = matcher.group(1); if (unusablePredicateStr != null) { predicateStr = ""; } matcher.appendReplacement(sb, predicateStr); } while (matcher.find()); matcher.appendTail(sb); expression = sb.toString(); } return expression; } /** * Replace certain XPath patterns, including some predicates, with substrings that are compatible with regular expressions. * * @param expression the input regular expressions string; may not be null * @return the regular expression with XPath patterns replaced with regular expression fragments; never null */ protected String replaceXPathPatterns( String expression ) { assert expression != null; // replace 2 or more sequential '|' characters in an OR expression expression = expression.replaceAll("[\\|]{2,}", "|"); // if there is an empty expression in an OR expression, make the whole segment optional ... 
// (e.g., "/a/b/(c|)/d" => "a/b(/(c))?/d" expression = expression.replaceAll("/(\\([^|]+)(\\|){2,}([^)]+\\))", "(/$1$2$3)?"); expression = expression.replaceAll("/\\(\\|+([^)]+)\\)", "(?:/($1))?"); expression = expression.replaceAll("/\\((([^|]+)(\\|[^|]+)*)\\|+\\)", "(?:/($1))?"); // // Allow any path (that doesn't contain an explicit counter) to contain a counter, // // done by replacing any '/' or '|' that isn't preceded by ']' or '*' or '/' or '(' with '(\[\d+\])?/'... // input = input.replaceAll("(?<=[^\\]\\*/(])([/|])", "(?:\\\\[\\\\d+\\\\])?$1"); // Does the path contain any '[]' or '[*]' or '[0]' or '[n]' (where n is any positive integers)... // '[*]/' => '(\[\d+\])?/' expression = expression.replaceAll("\\[\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional // '[]/' => '(\[\d+\])?/' expression = expression.replaceAll("\\[[*]\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional // '[0]/' => '(\[0\])?/' expression = expression.replaceAll("\\[0\\]", "(?:\\\\[0\\\\])?"); // index is optional // '[n]/' => '\[n\]/' expression = expression.replaceAll("\\[([1-9]\\d*)\\]", "\\\\[$1\\\\]"); // index is required // Change any other end predicates to not be wrapped by braces but to begin with a slash ... // ...'[x]' => ...'/x' expression = expression.replaceAll("(?///
"), the self reference (".
"), or wildcard (" **
", "*[]
" or "*[*]
"). Combinations of these individual expressions are also * considered to match anything. * * @return true if the expression matches anything, or false otherwise */ public boolean matchesAnything() { return ANYTHING_PATTERN.matcher(expression).matches(); } public static PathExpression all() { return ALL_PATHS_EXPRESSION; } private static final PathExpression ALL_PATHS_EXPRESSION = PathExpression.compile("//"); /** * Parse a path of the form{workspaceName}:{absolutePath}
or{absolutePath}
. * * @param path the path * @return the workspace path, or null if the supplied path doesn't match any of the path patterns */ public static WorkspacePath parsePathInWorkspace( String path ) { // Extract the workspace name and absPath from the supplied path ... java.util.regex.Matcher pathMatcher = WORKSPACE_AND_PATH_PATTERN.matcher(path); if (!pathMatcher.matches()) { // No match ... return null; } String workspaceName = pathMatcher.group(2); String absolutePath = pathMatcher.group(3); if (workspaceName == null || workspaceName.length() == 0 || workspaceName.trim().length() == 0) workspaceName = null; return new WorkspacePath(workspaceName, absolutePath); } @Immutable public static class WorkspacePath { public final String workspaceName; public final String path; public WorkspacePath( String workspaceName, String path ) { this.workspaceName = workspaceName; this.path = path; } /** * {@inheritDoc} * * @see java.lang.Object#hashCode() */ @Override public int hashCode() { return path.hashCode(); } /** * {@inheritDoc} * * @see java.lang.Object#equals(java.lang.Object) */ @Override public boolean equals( Object obj ) { if (obj == this) return true; if (obj instanceof WorkspacePath) { WorkspacePath that = (WorkspacePath)obj; if (!ObjectUtil.isEqualWithNulls(this.workspaceName, that.workspaceName)) return false; return this.path.equals(that.path); } return false; } /** * {@inheritDoc} * * @see java.lang.Object#toString() */ @Override public String toString() { return (workspaceName != null ? workspaceName : "") + ":" + path; } public WorkspacePath withWorkspaceName( String workspaceName ) { return new WorkspacePath(workspaceName, path); } public WorkspacePath withPath( String path ) { return new WorkspacePath(workspaceName, path); } } }