com.day.text.Replace Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of uber-jar Show documentation
There is a newer version: 6.5.21
/*************************************************************************
 *
 * ADOBE CONFIDENTIAL
 * __________________
 *
 *  Copyright 2012 Adobe Systems Incorporated
 *  All Rights Reserved.
 *
 * NOTICE:  All information contained herein is, and remains
 * the property of Adobe Systems Incorporated and its suppliers,
 * if any.  The intellectual and technical concepts contained
 * herein are proprietary to Adobe Systems Incorporated and its
 * suppliers and are protected by trade secret or copyright law.
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from Adobe Systems Incorporated.
 **************************************************************************/
package com.day.text;

import java.util.ArrayList;
import java.util.Iterator;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The <code>Replace</code> class implements the string replacement
 * functionality formerly coded in the <code>base/base_replace.c</code> C
 * file. This class acts as a data container for the concrete replacement
 * definition and also does the actual replacement for the String/UString ECMA
 * host object and also for Java clients.
 * <p>
 * The usage of this class is relatively simple:
 * <ol>
 * <li>Acquire an instance through the default constructor</li>
 * <li>Add pattern strings through the {@link #addPattern(String, String)},
 * {@link #addPattern(int, String, String)} or {@link #addPatterns(String[][])}
 * calls</li>
 * <li>Optionally set replacement flags</li>
 * <li>Call {@link #replace(String, int)} for each string you want to work on</li>
 * </ol>
 * <p>
 * The flags for the replacement are defined as follows:
 * <dl>
 * <dt>{@link #REPLACE_IGNORE_HTML}</dt>
 * <dd>Don't replace anything contained within a tag enclosed in &lt; and
 * &gt;, respectively.</dd>
 * <dt>{@link #REPLACE_WHOLE_WORD}</dt>
 * <dd>Only replace occurrences of patterns when they form standalone
 * words.</dd>
 * </dl>
 * <p>
 * The replacement algorithm works in three steps:
 * <ol>
 * <li>All occurrences of the patterns are looked for in the string</li>
 * <li>Within these occurrences, collisions are resolved using a weighting
 * algorithm which is based on the pattern length using a weight
 * reference.</li>
 * <li>For each occurrence not invalidated through collision detection, the
 * pattern is replaced by the replacement. The rest of the input String is
 * copied to the destination unmodified.</li>
 * </ol>
 */
public class Replace {

    // Implementation note: the matcher keeps its per-pattern scan position in
    // the Pattern objects themselves (see Pattern.pos), so concurrent calls
    // to replace() on the same instance would interfere with each other.

    /** default log */
    private static final Logger log = LoggerFactory.getLogger(Replace.class);

    /**
     * Replacement flag indicating no special treatment
     */
    public static final int REPLACE_NONE = 0x00;

    /**
     * Replacement flag to ignore any strings occurring within HTML tags
     */
    public static final int REPLACE_IGNORE_HTML = 0x01;

    /**
     * Replacement flag to only replace search words occurring as stand-alone
     * words
     */
    public static final int REPLACE_WHOLE_WORD = 0x02;

    /**
     * Actual replacement flags
     */
    private int flags;

    /**
     * The list of configured replacement patterns (elements are
     * {@link Pattern} instances)
     */
    private final ArrayList patterns;

    /**
     * Default constructor for the replacement object. The new instance has
     * no patterns and uses {@link #REPLACE_NONE}.
     */
    public Replace() {
        this.flags = REPLACE_NONE;
        this.patterns = new ArrayList();
    }

    /**
     * Calls {@link #replace(String, int)} with <code>wref=0</code>, which
     * lets the longest pattern win when occurrences collide.
     *
     * @param source The String to do the work in
     * @return The String with all valid occurrences replaced
     */
    public String replace(String source) {
        return replace(source, 0);
    }

    /**
     * The real replacement of the patterns within the input takes place here.
     * <p>
     * Colliding (overlapping) occurrences are resolved by comparing
     * <code>abs(patternLength - abs(wref))</code> of the candidates: if
     * <code>wref</code> is negative the smaller value wins, otherwise the
     * larger value wins. On a tie the occurrence found first is kept.
     *
     * @param source The String to do the work in
     * @param wref Reference weight used for collision resolution
     * @return The empty String if <code>source</code> is <code>null</code>
     *         or empty, <code>source</code> itself if no patterns are
     *         configured, otherwise the String with all valid occurrences
     *         replaced
     */
    public String replace(String source, int wref) {
        log.debug("replace: String \"{}\" with weight reference {}", source,
            Integer.valueOf(wref));

        // Return empty string if source is empty
        if (source == null || source.length() == 0) {
            return "";
        }

        // Return source if there is no pattern
        if (patterns.isEmpty()) {
            return source;
        }

        // Get the occurrences list
        char[] src = source.toCharArray();
        Occurrence[] occ = getOccurrences(src);

        // Resolve collisions between overlapping occurrences
        processOccurrences(occ, wref);

        // Now let the replacement occur
        return doReplace(src, occ);
    }

    // ---------- property accessors
    // --------------------------------------------

    /**
     * Set the indicated replacement flags
     *
     * @param flags The flags to set, a bitwise combination of the
     *            <code>REPLACE_*</code> constants
     */
    public void setFlags(int flags) {
        this.flags = flags;
    }

    /**
     * Returns the current flags
     *
     * @return the current flags
     */
    public int getFlags() {
        return flags;
    }

    /**
     * Insert the pattern at the indicated position in the internal pattern list
     *
     * @param pos The position to insert the pattern at
     * @param pattern The pattern String for the new pattern, must not be
     *            <code>null</code>
     * @param replacement The replacement String for the new pattern
     * @throws NullPointerException if <code>pattern</code> is
     *             <code>null</code>
     * @throws IndexOutOfBoundsException if <code>pos</code> is not a valid
     *             insertion index of the pattern list
     */
    public void addPattern(int pos, String pattern, String replacement) {
        log.debug(
            "addPattern: adding pattern \"{}\" to be replaced by \"{}\" as #{}",
            new Object[] { pattern, replacement, Integer.valueOf(pos) });
        patterns.add(pos, new Pattern(pattern, replacement));
    }

    /**
     * Append the pattern to the internal pattern list
     *
     * @param pattern The pattern String for the new pattern, must not be
     *            <code>null</code>
     * @param replacement The replacement String for the new pattern
     */
    public void addPattern(String pattern, String replacement) {
        addPattern(patterns.size(), pattern, replacement);
    }

    /**
     * Append all the pattern/replacement String pairs to the list. Each entry
     * is supposed to contain at least two entries; entries which are
     * <code>null</code> or shorter are silently skipped.
     *
     * @param prPairs An Array of pattern/replacement String pairs.
     */
    public void addPatterns(String[][] prPairs) {
        for (int i = 0; i < prPairs.length; i++) {
            String[] pair = prPairs[i];

            if (pair == null || pair.length < 2) {
                // ignore empty or incomplete pattern pair
                continue;
            }

            addPattern(patterns.size(), pair[0], pair[1]);
        }
    }

    /**
     * Returns an iterator over the existing patterns. The iterator is
     * obtained directly from the internal pattern list.
     *
     * @return an iterator over the existing patterns
     */
    public Iterator getPatterns() {
        return patterns.iterator();
    }

    // ---------- internal
    // ------------------------------------------------------

    /**
     * Check whether a character represents a word boundary in the sense of the
     * Replace class. This is a Replace specific set of characters and need
     * not agree with any classification offered by
     * <code>java.lang.Character</code>.
     *
     * @param c The character to check
     * @return True if the character is a word delimiter
     */
    private static boolean isWordDelim(char c) {
        switch (c) {
            case '\0':
            case '/':
            case ' ':
            case '\r':
            case '\n':
            case '\t':
            case '.':
            case ';':
            case '(':
            case ')':
            case ',':
            case '<':
            case '\"':
            case '\\':
                return true;
            default:
                return false;
        }
    }

    /**
     * Walk through the source String and mark all occurrences of the patterns.
     * <p>
     * A match is always recorded at the character which completes it, so the
     * returned occurrences are ordered by non-decreasing end index.
     *
     * @param src The characters of the source String to analyze
     * @return The occurrences to be later replaced
     */
    private Occurrence[] getOccurrences(char[] src) {
        final boolean debug = log.isDebugEnabled();
        if (debug) {
            log.debug(
                "getOccurrences: Getting all occurrences according to the flags {}",
                Integer.toHexString(flags));
        }

        final boolean ignoreHtml = (flags & REPLACE_IGNORE_HTML) != 0;
        final boolean wholeWord = (flags & REPLACE_WHOLE_WORD) != 0;
        final int srclen = src.length;

        // Snapshot the pattern list once instead of creating an iterator for
        // every single source character
        final Pattern[] pats = (Pattern[]) patterns.toArray(new Pattern[patterns.size()]);

        boolean insidetag = false;
        // true until the first character has been matched against the
        // patterns and again after each ignored tag: forces all patterns to
        // restart at their first character
        boolean reinit = true;
        ArrayList occ = new ArrayList();

        for (int i = 0; i < srclen; i++) {
            char c = src[i];

            if (ignoreHtml) {
                // Check whether we are within an ignored tag
                if (insidetag) {
                    // Possibly end the tag
                    if (c == '>') {
                        if (debug) {
                            log.debug("Ending an ignored HTML tag at pos {}",
                                Integer.valueOf(i));
                        }
                        insidetag = false;
                        reinit = true;
                    }
                    continue;
                }

                // Check whether we start a tag to ignore
                if (c == '<') {
                    if (debug) {
                        log.debug("Starting an ignored HTML tag at pos {}",
                            Integer.valueOf(i));
                    }
                    insidetag = true;
                    continue;
                }
            }

            // Now loop through the patterns
            for (int k = 0; k < pats.length; k++) {
                Pattern pat = pats[k];

                // ignore empty patterns
                if (pat.len == 0) {
                    continue;
                }

                // Fall back through the shift table until the current
                // character matches or the pattern has to restart
                int pos = reinit ? 0 : pat.pos;
                while (pos > -1 && pat.pattern[pos] != c) {
                    pos = pat.shift[pos];
                }
                pos++;

                if (pos >= pat.len) {
                    int beg = i - pos + 1;
                    int end = i - pos + pat.len; // last char in match
                    boolean valid = true;

                    // check word boundaries; both the start and the end of
                    // the input count as a delimiter
                    if (wholeWord) {
                        char b = (beg > 0) ? src[beg - 1] : '\0';
                        char e = (end + 1 < srclen) ? src[end + 1] : '\0';
                        valid = isWordDelim(b) && isWordDelim(e);
                    }

                    if (valid) {
                        if (debug) {
                            log.debug("Found pattern \"{}\" at position {}",
                                new String(pat.pattern), Integer.valueOf(beg + 1));
                        }
                        occ.add(new Occurrence(beg, end, pat));
                    } else if (debug) {
                        log.debug("Ignoring pattern \"{}\" at position {}",
                            new String(pat.pattern), Integer.valueOf(beg + 1));
                    }

                    // continue after a full match
                    pos = pat.shift[pos];
                }

                pat.pos = pos;
            }

            reinit = false;
        }

        if (debug) {
            log.debug("Found {} occurrences", Integer.valueOf(occ.size()));
        }

        return (Occurrence[]) occ.toArray(new Occurrence[occ.size()]);
    }

    /**
     * Process the occurrences and handle collisions through pattern weighting.
     * Occurrences losing a collision are invalidated by setting their
     * pattern to <code>null</code>; afterwards no two valid occurrences
     * overlap.
     * <p>
     * This method relies on the occurrences being ordered by non-decreasing
     * end index, as produced by {@link #getOccurrences(char[])}.
     *
     * @param occ The Occurrences to check for collisions
     * @param wref The reference weight; its sign selects whether the smaller
     *            (negative) or the larger (zero or positive) distance of the
     *            pattern weight to <code>abs(wref)</code> wins
     */
    private void processOccurrences(Occurrence[] occ, int wref) {
        final boolean debug = log.isDebugEnabled();
        final int awref = java.lang.Math.abs(wref);
        final int numOccs = occ.length;

        if (debug) {
            log.debug("Detecting collisions on {} patterns",
                Integer.valueOf(numOccs));
        }

        // Length of the longest occurrence: bounds how far behind occ[i] a
        // later occurrence may end and still reach back into occ[i]
        int maxLen = 0;
        for (int i = 0; i < numOccs; i++) {
            int length = occ[i].end - occ[i].begin + 1;
            if (length > maxLen) {
                maxLen = length;
            }
        }

        for (int i = 0; i < numOccs; i++) {

            // if an occurrence has been invalidated, ignore
            if (occ[i].pattern == null) {
                if (debug) {
                    log.debug("The occurrence has been invalidated: #{}, [{}-{}]",
                        new Object[] { Integer.valueOf(i),
                            Integer.valueOf(occ[i].begin),
                            Integer.valueOf(occ[i].end) });
                }
                continue;
            }

            int best = i;
            int weight = java.lang.Math.abs(occ[i].pattern.weight - awref);

            // Later occurrences never end before occ[i] ends, hence they
            // collide with occ[i] exactly if they begin at or before
            // occ[i].end. Begin indexes are not ordered, so the scan must not
            // stop at the first non-colliding occurrence; it stops once even
            // the longest possible occurrence could no longer reach back.
            for (int p = i + 1; p < numOccs && occ[p].end - maxLen < occ[i].end; p++) {

                if (occ[p].pattern == null || occ[p].begin > occ[i].end) {
                    continue;
                }

                int pweight = java.lang.Math.abs(occ[p].pattern.weight - awref);

                // sign of wref determines comparison
                boolean challengerWins = (wref < 0)
                        ? (pweight < weight)
                        : (pweight > weight);

                if (challengerWins) {
                    weight = pweight;
                    occ[best].pattern = null; // invalidate loser
                    best = p;
                } else {
                    occ[p].pattern = null; // invalidate loser
                }
            }
        }
    }

    /**
     * Do the actual replacement of all the pattern occurrences found in the
     * source String.
     *
     * @param src The characters of the source String
     * @param occ The occurrences with possibly invalidated occurrences; the
     *            valid ones must not overlap and must be in ascending order
     * @return The resulting String
     */
    private String doReplace(char[] src, Occurrence[] occ) {
        final boolean debug = log.isDebugEnabled();
        StringBuffer dest = new StringBuffer(src.length);

        // index of the first source character not yet copied or replaced
        int p = 0;

        log.debug("Doing the replacment of all valid occurrences");

        for (int i = 0; i < occ.length; i++) {
            Occurrence current = occ[i];

            // handle valid occurrences only
            if (current.pattern == null) {
                continue;
            }

            if (debug) {
                log.debug("Replacing \"{}\" by \"{}\" @{}", new Object[] {
                    new String(current.pattern.pattern),
                    current.pattern.replace,
                    Integer.valueOf(current.begin) });
            }

            // Append unmatched part before occurrence
            dest.append(src, p, current.begin - p);

            // Replace occurrence
            dest.append(current.pattern.replace);

            // Move on within the string
            p = current.end + 1;
        }

        // Append last part of the source string
        if (p < src.length) {
            dest.append(src, p, src.length - p);
        }

        return dest.toString();
    }

    // ---------- Internal data helper classes
    // ----------------------------------

    /**
     * The <code>Pattern</code> class abstracts the notion of a replacement
     * pattern. This pattern supports an optimized comparison algorithm in that
     * it is not always needed to re-check all of the pattern, if the pattern
     * has common subpatterns, such as 'tata'.
     */
    private static final class Pattern {

        /**
         * The search pattern to be replaced
         */
        public final char[] pattern;

        /**
         * The replacement String
         */
        public final String replace;

        /**
         * New position offsets for next comparison. This implements the
         * comparison optimization step. The table has <code>len + 1</code>
         * entries; the last one is used to continue after a full match.
         */
        public final int[] shift;

        /**
         * Next position within the pattern to compare. This position is
         * dependent on the already matched part and the shift table. Usually
         * if a match occurs, this position is incremented. If there is no
         * match, the position is set according to the shift table entry for
         * the position at which the non-match occurred.
         */
        public int pos;

        /**
         * Length of the pattern string
         */
        public final int len;

        /**
         * Weight of the pattern. Currently the weight of the pattern is
         * the same as its character length.
         */
        public final int weight;

        /**
         * Create and analyze a new pattern. During the analysis the shift
         * table is built according to the inner structure of the pattern.
         *
         * @param pattern The pattern String to be replaced
         * @param replace The String replacement
         * @throws NullPointerException if <code>pattern</code> is
         *             <code>null</code>
         */
        public Pattern(String pattern, String replace) {
            if (pattern == null) {
                throw new NullPointerException("pattern must not be null");
            }

            // init Pattern
            this.pattern = pattern.toCharArray();
            this.len = pattern.length();
            this.replace = replace;

            this.shift = new int[len + 1]; // Prepare the shift array
            this.weight = len; // The pattern weight is the length

            // Prepare shifts: j trails i and is the length of the pattern
            // start which has been matched again ending just before i
            int j = shift[0] = -1;
            for (int i = 0; i < len;) {
                while (j > -1 && this.pattern[i] != this.pattern[j]) {
                    j = shift[j];
                }
                i++;
                j++;
                // If the characters at i and j are the same, a mismatch at i
                // would be a mismatch at j as well: skip on to j's own shift
                shift[i] = ((i < len && this.pattern[i] == this.pattern[j]) || (i == j))
                        ? shift[j]
                        : j;
            }
        }

    }

    /**
     * The <code>Occurrence</code> class abstracts the notion of an occurrence
     * of a pattern string within the source string, which might later be
     * replaced by the indicated pattern.
     */
    private static final class Occurrence {

        /**
         * Start of this occurrence within the source string
         */
        public final int begin;

        /**
         * End of this occurrence within the source string (index of the last
         * character of the match)
         */
        public final int end;

        /**
         * The pattern associated with this occurrence. If during occurrence
         * processing an occurrence becomes invalid, this field will be set
         * to <code>null</code>.
         */
        public Pattern pattern;

        /**
         * Create a new occurrence.
         *
         * @param begin The starting point of the occurrence in the string
         * @param end The ending point (last character) of the occurrence
         * @param pattern The pattern applying for this occurrence
         */
        public Occurrence(int begin, int end, Pattern pattern) {
            this.begin = begin;
            this.end = end;
            this.pattern = pattern;
        }
    }

}