All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.oro.text.regex.Perl5Substitution Maven / Gradle / Ivy

/*
 * $Id: Perl5Substitution.java,v 1.13 2003/11/07 20:16:25 dfs Exp $
 *
 * ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2000 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
 *    must not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache" 
 *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
 *    name, without prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */


package org.apache.oro.text.regex;

import java.util.*;

/**
 * Perl5Substitution implements a Substitution consisting of a
 * literal string, but allowing Perl5 variable interpolation referencing
 * saved groups in a match.  This class is intended for use with
 * {@link Util#substitute Util.substitute}.
 * 

* The substitution string may contain variable interpolations referring * to the saved parenthesized groups of the search pattern. * A variable interpolation is denoted by $1, or $2, * or $3, etc. If you want such expressions to be * interpreted literally, you should set the numInterpolations * parameter to INTERPOLATE_NONE . It is easiest to explain * what an interpolated variable does by giving an example: *

    * Suppose you have the pattern b\d+: and you want to substitute * the b's for a's and the colon for a dash in parts of * your input matching the pattern. You can do this by changing the * pattern to b(\d+): and using the substitution expression * a$1-. When a substitution is made, the $1 means * "Substitute whatever was matched by the first saved group of the * matching pattern." An input of b123: after substitution * would yield a result of a123-. But there's a little more * to be aware of. If you set the numInterpolations parameter to * INTERPOLATE_ALL, then every time a match is found, the * interpolation variables are computed relative to that match. * But if numInterpolations is set to some positive integer, then * only the interpolation variables for the first numInterpolations * matches are computed relative to the most recent match. After that, * the remaining substitutions have their variable interpolations performed * relative to the numInterpolations 'th match. So using the * previously mentioned pattern and substitution expression, if you have * an input of
    Tank b123: 85  Tank b256: 32  Tank b78: 22
    * and use a numInterpolations value of INTERPOLATE_ALL and * numSubs value (see * {@link Util#substitute Util.substitute}) * of SUBSTITUTE_ALL, then your result will be: *
    Tank a123- 85  Tank a256- 32  Tank a78- 22
    * But if you set numInterpolations to 2 and keep * numSubs with a value of SUBSTITUTE_ALL, your result is: *
    Tank a123- 85  Tank a256- 32  Tank a256- 22
    * Notice how the last substitution uses the same value for $1 * as the second substitution. *
*

* A final thing to keep in mind is that if you use an interpolation variable * that corresponds to a group not contained in the match, then it is * interpreted as the empty string. So given the regular expression from the * example, and a substitution expression of a$2-, the result * of the last sample input would be: *

Tank a- 85  Tank a- 32  Tank a- 22
* The special substitution $& will interpolate the entire portion * of the input matched by the regular expression. $0 will * do the same, but it is recommended that it be avoided because the * latest versions of Perl use $0 to store the program name rather * than duplicate the behavior of $&. * Also, the result of substituting $ followed by a non-positive integer * is undefined. In order to include a $ in a substitution, it should * be escaped with a backslash (e.g., "\\$0"). *

* Perl5 double-quoted string case modification is also supported in * the substitution. The following escape sequences are supported: *

*
\\U
make substitution uppercase until end of substitution or \\E *
\\u
make next character uppercase *
\\L
make substitution uppercase until end of substitution or \\E *
\\l
make next character uppercase *
\\E
mark the end of the case modification *
* The double backslashes are shown to remind you that to make a * backslash get past Java's string handling and appear as a backslash * to the substitution, you must escape the backslash. * * @version @version@ * @since 1.1 * @see Substitution * @see Util * @see Util#substitute * @see Substitution * @see StringSubstitution */ public class Perl5Substitution extends StringSubstitution { /** * A constant used when creating a Perl5Substitution indicating that * interpolation variables should be computed relative to the most * recent pattern match. */ public static final int INTERPOLATE_ALL = 0; /** * A constant used when creating a Perl5Substitution indicating that * interpolation variables should be interpreted literally, effectively * disabling interpolation. */ public static final int INTERPOLATE_NONE = -1; /** * The initial size and unit of growth for the * {@link #_subOpCodes _subOpCodes} array. */ private static final int __OPCODE_STORAGE_SIZE = 32; /** * The maximum number of groups supported by interpolation. */ private static final int __MAX_GROUPS = Character.MAX_VALUE; /** * A constant declaring opcode for copy operation. */ static final int _OPCODE_COPY = -1; /** * A constant declaring opcode for lowercase char operation. */ static final int _OPCODE_LOWERCASE_CHAR = -2; /** * A constant declaring opcode for uppercase char operation. */ static final int _OPCODE_UPPERCASE_CHAR = -3; /** * A constant declaring opcode for lowercase mode operation. */ static final int _OPCODE_LOWERCASE_MODE = -4; /** * A constant declaring opcode for lowercase mode operation. */ static final int _OPCODE_UPPERCASE_MODE = -5; /** * A constant declaring opcode for lowercase mode operation. 
*/ static final int _OPCODE_ENDCASE_MODE = -6; int _numInterpolations; int[] _subOpcodes; int _subOpcodesCount; char[] _substitutionChars; transient String _lastInterpolation; private static final boolean __isInterpolationCharacter(char ch) { return (Character.isDigit(ch) || ch == '&'); } private void __addElement(int value) { int len = _subOpcodes.length; if (_subOpcodesCount == len) { int[] newarray = new int[len + __OPCODE_STORAGE_SIZE]; System.arraycopy(_subOpcodes, 0, newarray, 0, len); _subOpcodes = newarray; } _subOpcodes[_subOpcodesCount++] = value; } private void __parseSubs(String sub) { boolean saveDigits, escapeMode, caseMode; int posParam; int offset; char[] subChars = _substitutionChars = sub.toCharArray(); int subLength = subChars.length; _subOpcodes = new int[__OPCODE_STORAGE_SIZE]; _subOpcodesCount = 0; posParam = 0; offset = -1; saveDigits = false; escapeMode = false; caseMode = false; for (int current = 0; current < subLength; current++) { char c = subChars[current]; char nextc; int next = current + 1; // Save digits if (saveDigits) { int digit = Character.digit(c, 10); if (digit > -1) { if (posParam <= __MAX_GROUPS) { posParam *= 10; posParam += digit; } if (next == subLength) { __addElement(posParam); } continue; } else if(c == '&') { if(/*current > 0 &&*/subChars[current - 1] == '$') { __addElement(0); posParam = 0; saveDigits = false; continue; } } __addElement(posParam); posParam = 0; saveDigits = false; } if ((c != '$' && c != '\\') || escapeMode) { escapeMode = false; if (offset < 0) { offset = current; __addElement(_OPCODE_COPY); __addElement(offset); } if (next == subLength) { __addElement(next - offset); } continue; } if (offset >= 0) { __addElement(current - offset); offset = -1; } // Only do positional and escapes if we have a next char if (next == subLength) continue; nextc = subChars[next]; // Positional params if (c == '$') { saveDigits = __isInterpolationCharacter(nextc); } else if (c == '\\') { // Escape codes if (nextc == 'l') { 
if (!caseMode){ __addElement(_OPCODE_LOWERCASE_CHAR); current++; } } else if (nextc == 'u') { if (!caseMode) { __addElement(_OPCODE_UPPERCASE_CHAR); current++; } } else if (nextc == 'L') { __addElement(_OPCODE_LOWERCASE_MODE); current++; caseMode = true; } else if (nextc == 'U') { __addElement(_OPCODE_UPPERCASE_MODE); current++; caseMode = true; } else if (nextc == 'E') { __addElement(_OPCODE_ENDCASE_MODE); current++; caseMode = false; } else { escapeMode = true; } } } } String _finalInterpolatedSub(MatchResult result) { StringBuffer buffer = new StringBuffer(10); _calcSub(buffer, result); return buffer.toString(); } void _calcSub(StringBuffer buffer, MatchResult result) { int size, offset, count, caseMode; char[] sub, str, match; int[] subOpcodes = _subOpcodes; caseMode = 0; str = _substitutionChars; match = result.group(0).toCharArray(); size = _subOpcodesCount; for (int element = 0; element < size; element++) { int value = subOpcodes[element]; // If we have a group, set up interpolation, else // interpret op code. 
if(value >= 0 && value < result.groups()) { int end, len; offset = result.begin(value); if (offset < 0) continue; end = result.end(value); if (end < 0) continue; len = result.length(); if (offset >= len || end > len || offset >= end) continue; count = end - offset; sub = match; } else if (value == _OPCODE_COPY) { element++; if (element >= size) continue; offset = subOpcodes[element]; element++; if (element >= size) continue; count = subOpcodes[element]; sub = str; } else if (value == _OPCODE_LOWERCASE_CHAR || value == _OPCODE_UPPERCASE_CHAR) { if (caseMode != _OPCODE_LOWERCASE_MODE && caseMode != _OPCODE_UPPERCASE_MODE) caseMode = value; continue; } else if (value == _OPCODE_LOWERCASE_MODE || value == _OPCODE_UPPERCASE_MODE) { caseMode = value; continue; } else if (value == _OPCODE_ENDCASE_MODE) { caseMode = 0; continue; } else continue; // Apply modes to buf if (caseMode == _OPCODE_LOWERCASE_CHAR) { buffer.append(Character.toLowerCase(sub[offset++])); buffer.append(sub, offset, --count); caseMode = 0; } else if (caseMode == _OPCODE_UPPERCASE_CHAR) { buffer.append(Character.toUpperCase(sub[offset++])); buffer.append(sub, offset, --count); caseMode = 0; } else if (caseMode == _OPCODE_LOWERCASE_MODE) { for (int end = offset + count; offset < end; ) { buffer.append(Character.toLowerCase(sub[offset++])); } } else if (caseMode == _OPCODE_UPPERCASE_MODE) { for (int end = offset + count; offset < end; ) { buffer.append(Character.toUpperCase(sub[offset++])); } } else buffer.append(sub, offset, count); } } /** * Default constructor initializing substitution to a zero length * String and the number of interpolations to * {@link #INTERPOLATE_ALL}. */ public Perl5Substitution() { this("", INTERPOLATE_ALL); } /** * Creates a Perl5Substitution using the specified substitution * and setting the number of interpolations to * {@link #INTERPOLATE_ALL}. *

* @param substitution The string to use as a substitution. */ public Perl5Substitution(String substitution) { this(substitution, INTERPOLATE_ALL); } /** * Creates a Perl5Substitution using the specified substitution * and setting the number of interpolations to the specified value. *

* @param substitution The string to use as a substitution. * @param numInterpolations * If set to INTERPOLATE_NONE, interpolation variables are * interpreted literally and not as references to the saved * parenthesized groups of a pattern match. If set to * INTERPOLATE_ALL , all variable interpolations * are computed relative to the pattern match responsible for * the current substitution. If set to a positive integer, * the first numInterpolations substitutions have * their variable interpolation performed relative to the * most recent match, but the remaining substitutions have * their variable interpolations performed relative to the * numInterpolations 'th match. */ public Perl5Substitution(String substitution, int numInterpolations) { setSubstitution(substitution, numInterpolations); } /** * Sets the substitution represented by this Perl5Substitution, also * setting the number of interpolations to * {@link #INTERPOLATE_ALL}. * You should use this method in order to avoid repeatedly allocating new * Perl5Substitutions. It is recommended that you allocate a single * Perl5Substitution and reuse it by using this method when appropriate. *

* @param substitution The string to use as a substitution. */ public void setSubstitution(String substitution) { setSubstitution(substitution, INTERPOLATE_ALL); } /** * Sets the substitution represented by this Perl5Substitution, also * setting the number of interpolations to the specified value. * You should use this method in order to avoid repeatedly allocating new * Perl5Substitutions. It is recommended that you allocate a single * Perl5Substitution and reuse it by using this method when appropriate. *

* @param substitution The string to use as a substitution. * @param numInterpolations * If set to INTERPOLATE_NONE, interpolation variables are * interpreted literally and not as references to the saved * parenthesized groups of a pattern match. If set to * INTERPOLATE_ALL , all variable interpolations * are computed relative to the pattern match responsible for * the current substitution. If set to a positive integer, * the first numInterpolations substitutions have * their variable interpolation performed relative to the * most recent match, but the remaining substitutions have * their variable interpolations performed relative to the * numInterpolations 'th match. */ public void setSubstitution(String substitution, int numInterpolations) { super.setSubstitution(substitution); _numInterpolations = numInterpolations; if(numInterpolations != INTERPOLATE_NONE && (substitution.indexOf('$') != -1 || substitution.indexOf('\\') != -1)) __parseSubs(substitution); else _subOpcodes = null; _lastInterpolation = null; } /** * Appends the substitution to a buffer containing the original input * with substitutions applied for the pattern matches found so far. * See * {@link Substitution#appendSubstitution Substitution.appendSubstition()} * for more details regarding the expected behavior of this method. *

* @param appendBuffer The buffer containing the new string resulting * from performing substitutions on the original input. * @param match The current match causing a substitution to be made. * @param substitutionCount The number of substitutions that have been * performed so far by Util.substitute. * @param originalInput The original input upon which the substitutions are * being performed. This is a read-only parameter and is not modified. * @param matcher The PatternMatcher used to find the current match. * @param pattern The Pattern used to find the current match. */ public void appendSubstitution(StringBuffer appendBuffer, MatchResult match, int substitutionCount, PatternMatcherInput originalInput, PatternMatcher matcher, Pattern pattern) { if(_subOpcodes == null) { super.appendSubstitution(appendBuffer, match, substitutionCount, originalInput, matcher, pattern); return; } if(_numInterpolations < 1 || substitutionCount < _numInterpolations) _calcSub(appendBuffer, match); else { if(substitutionCount == _numInterpolations) _lastInterpolation = _finalInterpolatedSub(match); appendBuffer.append(_lastInterpolation); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy