de.unkrig.commons.text.pattern.Pattern2 Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of de-unkrig-commons Show documentation
A versatile Java(TM) library that implements many useful container and utility classes.
There is a newer version: 1.1.12

/*
 * de.unkrig.commons - A general-purpose Java class library
 *
 * Copyright (c) 2011, Arno Unkrig
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
 * following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
 *       following disclaimer.
 *    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
 *       following disclaimer in the documentation and/or other materials provided with the distribution.
 *    3. The name of the author may not be used to endorse or promote products derived from this software without
 *       specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

package de.unkrig.commons.text.pattern;

import java.io.File;
import java.util.regex.Pattern;

/**
 * This class extends the concepts of the JDK {@link Pattern java.util.regex.Pattern} class with a new {@link
 * #WILDCARD} compilation flag, which modifies the pattern compilation such that it combines the well-known
 * wildcard pattern matching with the power of regular
 * expressions.
 *
 * @see #compile(String, int)
 */
public final
class Pattern2 {

    /** The characters that get special treatment when a pattern is compiled with the {@link #WILDCARD} flag. */
    private static final String WILDCARD_META_CHARACTERS = "*?./{";

    /**
     * Regex character class that matches any single character except '/', the system-dependent file separator
     * and '!'. The file separator is backslash-escaped because on some platforms it is itself a backslash.
     */
    private static final String NON_SEPARATOR_CHAR = (
        File.separatorChar == '/'
        ? "[^/!]"
        : "[^/\\" + File.separatorChar + "!]"
    );

    /** Regex that matches a '/' or the system-dependent file separator. */
    private static final String SEPARATOR_CHAR = (
        File.separatorChar == '/'
        ? "/"
        : "[/\\" + File.separatorChar + "]"
    );

    private Pattern2() {}

    /**
     * Modifies the pattern compilation as follows:
     * <p>
     *   The meaning of the '*' and '?' metacharacters is now different, and '.' is no longer a metacharacter.
     * </p>
     * <p>
     *   '/' is now a metacharacter, i.e. to include it literally in the pattern, it must be escaped with a
     *   backslash.
     * </p>
     * <p>
     *   The semantics of '*', '?', '.' and '/' are as follows:
     * </p>
     * <table border="1">
     *   <tr><th>Construct</th><th>Matches</th></tr>
     *   <tr><td>{@code *}</td><td>Zero or more characters except '/', the file separator and '!'</td></tr>
     *   <tr><td>{@code **}</td><td>Zero or more characters except '!'</td></tr>
     *   <tr><td>{@code ***}</td><td>Zero or more characters</td></tr>
     *   <tr><td>{@code ?}</td><td>Any character except '/', the file separator and '!'</td></tr>
     *   <tr><td>{@code .}</td><td>The '.'</td></tr>
     *   <tr>
     *     <td>{@code /}</td>
     *     <td>
     *       '/' or the system-dependent file separator (see {@link java.io.File#separatorChar separatorChar})
     *     </td>
     *   </tr>
     * </table>
     * <p>
     *   Naturally, '*' is no longer the regex quantifier '*', so if you need to quantify 'zero or more', then you'd
     *   have to write '{0,}'. Similarly, to quantify 'zero or one', you can no longer write '{@code ?}', but must
     *   use '{0,1}'.
     * </p>
     * <p>
     *   Metacharacters that are escaped with a backslash, that appear within '{@code \Q ... \E}', or that appear
     *   within a character class ('{@code [...]}') are left untouched.
     * </p>
     */
    public static final int WILDCARD = 0x20000000;

    /**
     * Like {@link Pattern#compile(String,int)}, but with support for the {@link #WILDCARD} flag.
     * <p>
     *   Notice that iff {@link #WILDCARD} is given, then {@link Pattern#toString()} (and {@link Pattern#pattern()})
     *   of the result return the regular expression that was generated from the wildcard pattern (and not the
     *   wildcard pattern, as you'd probably expect).
     * </p>
     * <p>
     *   The {@link #WILDCARD} bit is always removed from the flags before they are handed to {@link
     *   Pattern#compile(String, int)}, because that method rejects flag bits it does not know. Consequently,
     *   {@link Pattern#flags()} of the result never contains {@link #WILDCARD}.
     * </p>
     * <p>
     *   If {@link Pattern#LITERAL} is given together with {@link #WILDCARD}, then {@link Pattern#LITERAL} wins, i.e.
     *   no wildcard translation takes place.
     * </p>
     *
     * @param pattern The regular expression, or, iff {@link #WILDCARD} is given, the wildcard pattern
     * @param flags   Any combination of the {@link Pattern} match flags and {@link #WILDCARD}
     * @return        The compiled pattern
     * @throws java.util.regex.PatternSyntaxException The (generated) regular expression is invalid
     * @see #WILDCARD
     * @see Pattern#CANON_EQ
     * @see Pattern#CASE_INSENSITIVE
     * @see Pattern#COMMENTS
     * @see Pattern#DOTALL
     * @see Pattern#LITERAL
     * @see Pattern#MULTILINE
     * @see Pattern#UNICODE_CASE
     * @see Pattern#UNIX_LINES
     */
    public static Pattern
    compile(String pattern, int flags) {

        if ((flags & Pattern2.WILDCARD) == 0) {
            return Pattern.compile(pattern, flags);
        }

        // "Pattern.compile()" throws an "IllegalArgumentException" on flag bits it does not define, so our private
        // flag must never be passed through.
        flags &= ~Pattern2.WILDCARD;

        if ((flags & Pattern.LITERAL) != 0) {
            return Pattern.compile(pattern, flags);
        }

        return Pattern.compile(Pattern2.wildcardToRegex(pattern), flags);
    }

    /**
     * Translates a wildcard pattern (see {@link #WILDCARD}) into the equivalent regular expression.
     * <p>
     *   The scan is restarted (via {@link #findMeta(String, String, int)}) right behind each processed
     *   metacharacter; all text between metacharacters is copied verbatim.
     * </p>
     */
    private static String
    wildcardToRegex(String wildcardPattern) {

        final int     length = wildcardPattern.length();
        StringBuilder regex  = new StringBuilder(length + 16);

        for (int from = 0;;) {

            int idx = Pattern2.findMeta(Pattern2.WILDCARD_META_CHARACTERS, wildcardPattern, from);

            // Copy everything up to the metacharacter (or the end of the pattern) unchanged.
            regex.append(wildcardPattern, from, idx);
            if (idx == length) return regex.toString();

            switch (wildcardPattern.charAt(idx)) {

            case '*':
                // The longest run of asterisks must be checked first.
                if (wildcardPattern.startsWith("***", idx)) {
                    regex.append(".*");
                    from = idx + 3;
                } else
                if (wildcardPattern.startsWith("**", idx)) {
                    regex.append("[^!]*");
                    from = idx + 2;
                } else
                {
                    regex.append(Pattern2.NON_SEPARATOR_CHAR).append('*');
                    from = idx + 1;
                }
                break;

            case '?':
                regex.append(Pattern2.NON_SEPARATOR_CHAR);
                from = idx + 1;
                break;

            case '.':
                regex.append("\\.");
                from = idx + 1;
                break;

            case '/':
                regex.append(Pattern2.SEPARATOR_CHAR);
                from = idx + 1;
                break;

            case '{':
                // '{0,1}' and '{0,}' are the wildcard-mode spellings of the regex quantifiers '?' and '*'; any
                // other curly-brace construct is passed on as-is.
                if (wildcardPattern.startsWith("{0,1}", idx)) {
                    regex.append('?');
                    from = idx + 5;
                } else
                if (wildcardPattern.startsWith("{0,}", idx)) {
                    regex.append('*');
                    from = idx + 4;
                } else
                {
                    regex.append('{');
                    from = idx + 1;
                }
                break;

            default:
                // "findMeta()" returns only positions of characters from WILDCARD_META_CHARACTERS.
                throw new IllegalStateException();
            }
        }
    }

    /**
     * Splits the given string into "pattern" and "replacement". The "pattern" is the text before the first
     * non-escaped equals sign ("="), the "replacement" is the text after that equals sign.
     * <p>
     *   Example:
     * </p>
     * <p>
     *   {@code "(foo)=$1.bak"} results in <code>{ "(foo)", "$1.bak" }</code>.
     * </p>
     * <p>
     *   Iff there is no non-escaped equals sign, then the resulting replacement is {@code null}.
     * </p>
     * <p>
     *   "Non-escaped" is defined by {@link #findMeta(String, String, int)}, i.e. equals signs after a backslash,
     *   within '{@code \Q ... \E}', within a character class and within curly braces do not count.
     * </p>
     *
     * @return An array of { pattern, replacement }
     */
    public static String[]
    parsePatternAndReplacement(String pattern) {

        String replacement = null;

        int idx = Pattern2.findMeta("=", pattern, 0);
        if (idx != pattern.length()) {
            replacement = pattern.substring(idx + 1);
            pattern     = pattern.substring(0, idx);
        }

        return new String[] { pattern, replacement };
    }

    /**
     * Finds the next unescaped occurrence of one of the {@code metaCharacters} within {@code subject}, starting at
     * position {@code offset}. Metacharacters can be escaped by backslashes or by '{@code \Q ... \E}'.
     * <p>
     *   Characters within a character class ('{@code [...]}') are never reported. Characters within curly braces
     *   are not reported either, unless '<code>{</code>' is itself one of the {@code metaCharacters} (in which
     *   case the opening brace is reported instead of being tracked).
     * </p>
     * <p>
     *   The scanner state is not carried over between invocations, i.e. {@code offset} is assumed to be outside
     *   of any escape sequence, quoted section, character class or curly-brace construct.
     * </p>
     *
     * @param offset Must not be greater than {@code subject.length()}
     * @return       The position of the next meta character, or {@code subject.length()} iff no meta character is
     *               found
     */
    public static int
    findMeta(String metaCharacters, String subject, int offset) {
        int     cc    = 0;     // Character class count (character classes may be nested).
        boolean q     = false; // Inside curly-brace quantifier, e.g. "{0,}"
        int     state = 0;
        for (; offset != subject.length(); offset++) {
            char c = subject.charAt(offset);
            switch (state) {
            case 0: // Normal text.
                if (c == '\\') {
                    state = 1;
                } else
                if (c == '[') {
                    cc++;
                } else
                if (c == ']' && cc > 0) {
                    cc--;
                } else
                if (cc > 0) {
                    ; // Inside a character class nothing else is special.
                } else
                if (metaCharacters.indexOf(c) != -1 && !q) {
                    return offset;
                } else
                if (c == '{') {
                    q = true;
                } else
                if (c == '}') {
                    q = false;
                }
                break;
            case 1: // After backslash.
                state = c == 'Q' ? 2 : 0;
                break;
            case 2: // In quoted section.
                if (c == '\\') state = 3;
                break;
            case 3: // In quoted section, after backslash.
                if (c == 'E') {
                    state = 0;
                } else
                if (c != '\\') {
                    state = 2;
                }
                // On another backslash, stay in this state: That backslash may introduce the "\E".
                break;
            default:
                throw new IllegalStateException();
            }
        }
        return offset;
    }
}
Construct	Matches
Wildcards
`*`	Zero or more characters except '/', the file separator and '!'
`**`	Zero or more characters except '!'
`***`	Zero or more characters
`?`	Any character except '/', the file separator and '!'
`.`	The '.'
`/`	'/' or the system-dependent file separator (see {@link java.io.File#separatorChar separatorChar})