All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.unkrig.commons.text.pattern.Substitutor Maven / Gradle / Ivy


/*
 * de.unkrig.commons - A general-purpose Java class library
 *
 * Copyright (c) 2016, Arno Unkrig
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
 * following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
 *       following disclaimer.
 *    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
 *       following disclaimer in the documentation and/or other materials provided with the distribution.
 *    3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote
 *       products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package de.unkrig.commons.text.pattern;

import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.unkrig.commons.lang.protocol.FunctionWhichThrows;
import de.unkrig.commons.lang.protocol.NoException;
import de.unkrig.commons.lang.protocol.TransformerWhichThrows;
import de.unkrig.commons.nullanalysis.Nullable;

/**
 * Replaces pattern matches in a stream of strings ("chunks"). Matches are allowed across chunks; thus, this
 * transformer is stateful.
 * 

* As the substitutor consumes chunks, it produces a sequence of strings, and the concatenation of * the chunks read equals the concatenation of the produced strings, except that all matches of the * pattern are substituted with the replacements generated by the matchReplacer. *

*

* Iff the input to the transformer is {@code ""} (the empty string), then the "rest" of any pending matches is * returned. This is sort of a "flush" operation. *

*

* The substitutor attempts to keep as little characters as possible in memory, which make it possible to find * matches in documents that are too large to load into memory. *

*

* If you plan to use "look-behinds" (the most common of which is "^"), bear in mind that the look-behind memory is * limited; by default to {@value #DEFAULT_LOOKBEHIND_LIMIT}. If you need more look-behind space, use {@link * #Substitutor(Pattern, FunctionWhichThrows, int)}. *

*

* Also bear in mind that, specifically when using "greedy quantifiers", that it may happen quite easily that the * entire input must be read into memory, only to find that there is no match. E.g. the pattern {@code "a.*b"}, as * soon as it hits the letter "a", will load all the remaining text into memory, because there might still come * (another) "b". *

* * @param The exception type that the "match replacer" (see {@link #Substitutor(Pattern, FunctionWhichThrows)} * is allowed to throw; use {@link NoException} if your match replacer does not throw any (checked) * exceptions */ public class Substitutor implements TransformerWhichThrows { /** * The number of characters that can safely be used for look-behind, unless a different value is configured through * {@link #Substitutor(Pattern, FunctionWhichThrows, int)}. */ public static final int DEFAULT_LOOKBEHIND_LIMIT = 10; // CONFIGURATION private final Pattern pattern; private final FunctionWhichThrows matchReplacer; private final int lookBehindLimit; /** * Contains a suffix of the input char sequence. */ StringBuilder buffer = new StringBuilder(); /** * Offset in {@link #buffer} */ int start; /** * Number of substitutions executed so far. */ private int substitutionCount; private int offsetDelta; /** * @deprecated The {@link Matcher} that is fed to the matchReplacer has incorrect offsets; use {@link * #create(Pattern, FunctionWhichThrows)} instead */ @Deprecated public Substitutor( Pattern pattern, FunctionWhichThrows matchReplacer ) { this(pattern, matchReplacer, Substitutor.DEFAULT_LOOKBEHIND_LIMIT); } /** * @deprecated The {@link Matcher} that is fed to the matchReplacer has incorrect offsets; use {@link * #create(Pattern, FunctionWhichThrows, int)} instead */ @Deprecated public Substitutor( Pattern pattern, FunctionWhichThrows matchReplacer, int lookBehindLimit ) { this.pattern = pattern; this.matchReplacer = matchReplacer; this.lookBehindLimit = lookBehindLimit; } /** * Equivalent with {@link #create(Pattern, FunctionWhichThrows, int) create}{@code (}pattern{@code ,} * matchReplacer{@code ,} {@link #DEFAULT_LOOKBEHIND_LIMIT}{@code )}. */ public static Substitutor create( Pattern pattern, FunctionWhichThrows matchReplacer ) { return Substitutor.create(pattern, matchReplacer, Substitutor.DEFAULT_LOOKBEHIND_LIMIT); } /** * @return A {@link Substitutor} that substitutes all matches of the pattern through the * matchReplacer */ public static Substitutor create( Pattern pattern, final FunctionWhichThrows matchReplacer, int lookBehindLimit ) { final Substitutor[] s = new Substitutor[1]; Substitutor result = new Substitutor(pattern, new FunctionWhichThrows() { @Override @Nullable public CharSequence call(final @Nullable Matcher m) throws EX { assert m != null; return matchReplacer.call(new MatchResult() { // SUPPRESS CHECKSTYLE LineLength:7 @Override public int start(int group) { return m.start(group) + s[0].offsetDelta; } @Override public int start() { return m.start() + s[0].offsetDelta; } @Override public int groupCount() { return m.groupCount(); } @Override public String group(int group) { return m.group(group); } @Override public String group() { return m.group(); } @Override public int end(int group) { return m.end(group) + s[0].offsetDelta; } @Override public int end() { return m.end() + s[0].offsetDelta; } @Override public String toString() { String s = "[match(" + m.start() + '-' + m.end() + ")=" + this.group(); for (int i = 1; i <= this.groupCount(); i++) { s += ", group#" + i + '(' + m.start(i) + '-' + m.end(i) + ")=" + m.group(i); } s += ']'; return s; } }); } }, lookBehindLimit); s[0] = result; return result; } /** * Substitutes all matches in the subject and returns it. If there is a "partial match" at the * end of the subject, then only a prefix of the result is returned, and the suffix is processed as part of * following invocations. */ @Override public CharSequence transform(CharSequence in) throws EX { if (in.length() == 0) return this.flush(); this.buffer.append(in); final StringBuilder result = new StringBuilder(); final Matcher m = this.pattern.matcher(this.buffer); m.useTransparentBounds(true); m.useAnchoringBounds(false); for (;;) { m.region(this.start, this.buffer.length()); if (m.lookingAt()) { if (m.hitEnd()) { // E.g. "A.*B" => "AxxxBxx" break; } // E.g. "A" => "Axxx" CharSequence replacement = this.matchReplacer.call(m); if (replacement == null) { result.append(this.buffer.charAt(this.start++)); } else { this.substitutionCount++; result.append(replacement); this.substitutionCount++; if (m.end() == m.start()) { result.append(this.buffer.charAt(this.start++)); } else { this.start = m.end(); } } } else { if (m.hitEnd()) { // E.g. "Axxxxxx" => "Axxx" break; } // E.g. "A" => "Bxx" if (this.start == this.buffer.length()) break; result.append(this.buffer.charAt(this.start++)); } } if (this.start > this.lookBehindLimit) { // Truncate the buffer to save memory. this.buffer.delete(0, this.start - this.lookBehindLimit); this.start = this.lookBehindLimit; } if (this.buffer.capacity() > 10 * this.buffer.length()) this.buffer.trimToSize(); return result.toString(); } /** * @return The number of substitutions executed so far, i.e. the number of invocations of the * matchReplacer that returned a non-{@code null} value */ public int substitutionCount() { return this.substitutionCount; } private CharSequence flush() throws EX { if (this.buffer.length() == 0) return ""; Matcher m = this.pattern.matcher(this.buffer); m.useTransparentBounds(true); m.useAnchoringBounds(false); m.region(this.start, this.buffer.length()); if (!m.find(this.start)) { // No match in "the rest" - just return "the rest". final String result = this.buffer.substring(this.start); this.buffer.setLength(0); this.start = 0; return result; } StringBuilder result = new StringBuilder(); do { CharSequence replacement = this.matchReplacer.call(m); if (replacement != null) { result.append(this.buffer, this.start, m.start()).append(replacement); this.start = m.end(); this.substitutionCount++; } if (m.start() == m.end()) { // Special case: Zero-length match. if (this.start == this.buffer.length()) break; result.append(this.buffer.charAt(this.start++)); } } while (m.find(this.start)); result.append(this.buffer, this.start, this.buffer.length()); this.buffer.setLength(0); this.start = 0; return result.toString(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy