All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.xerces.impl.xpath.regex.REUtil Maven / Gradle / Ivy

Go to download

A processor for parsing, validating, serializing and manipulating XML, written in Java

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.xerces.impl.xpath.regex;

import java.text.CharacterIterator;

/**
 * @xerces.internal
 * 
 * @version $Id$
 */
public final class REUtil {
    private REUtil() {
    }

    static final int composeFromSurrogates(int high, int low) {
        return 0x10000 + ((high-0xd800)<<10) + low-0xdc00;
    }

    static final boolean isLowSurrogate(int ch) {
        return (ch & 0xfc00) == 0xdc00;
    }

    static final boolean isHighSurrogate(int ch) {
        return (ch & 0xfc00) == 0xd800;
    }

    static final String decomposeToSurrogates(int ch) {
        char[] chs = new char[2];
        ch -= 0x10000;
        chs[0] = (char)((ch>>10)+0xd800);
        chs[1] = (char)((ch&0x3ff)+0xdc00);
        return new String(chs);
    }

    static final String substring(CharacterIterator iterator, int begin, int end) {
        char[] src = new char[end-begin];
        for (int i = 0;  i < src.length;  i ++)
            src[i] = iterator.setIndex(i+begin);
        return new String(src);
    }

    // ================================================================

    static final int getOptionValue(int ch) {
        int ret = 0;
        switch (ch) {
          case 'i':
            ret = RegularExpression.IGNORE_CASE;
            break;
          case 'm':
            ret = RegularExpression.MULTIPLE_LINES;
            break;
          case 's':
            ret = RegularExpression.SINGLE_LINE;
            break;
          case 'x':
            ret = RegularExpression.EXTENDED_COMMENT;
            break;
          case 'u':
            ret = RegularExpression.USE_UNICODE_CATEGORY;
            break;
          case 'w':
            ret = RegularExpression.UNICODE_WORD_BOUNDARY;
            break;
          case 'F':
            ret = RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION;
            break;
          case 'H':
            ret = RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;
            break;
          case 'X':
            ret = RegularExpression.XMLSCHEMA_MODE;
            break;
          case ',':
            ret = RegularExpression.SPECIAL_COMMA;
            break;
          case 'b':
            ret = RegularExpression.ALLOW_UNRECOGNIZED_BLOCK_NAME;
            break;
          case 'h':
            ret = RegularExpression.HYPHEN_IN_SCHEMA_11;
          default:
        }
        return ret;
    }

    static final int parseOptions(String opts) throws ParseException {
        if (opts == null)  return 0;
        int options = 0;
        for (int i = 0;  i < opts.length();  i ++) {
            int v = getOptionValue(opts.charAt(i));
            if (v == 0)
                throw new ParseException("Unknown Option: "+opts.substring(i), -1);
            options |= v;
        }
        return options;
    }

    static final String createOptionString(int options) {
        StringBuffer sb = new StringBuffer(9);
        if ((options & RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION) != 0)
            sb.append((char)'F');
        if ((options & RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) != 0)
            sb.append((char)'H');
        if ((options & RegularExpression.XMLSCHEMA_MODE) != 0)
            sb.append((char)'X');
        if ((options & RegularExpression.IGNORE_CASE) != 0)
            sb.append((char)'i');
        if ((options & RegularExpression.MULTIPLE_LINES) != 0)
            sb.append((char)'m');
        if ((options & RegularExpression.SINGLE_LINE) != 0)
            sb.append((char)'s');
        if ((options & RegularExpression.USE_UNICODE_CATEGORY) != 0)
            sb.append((char)'u');
        if ((options & RegularExpression.UNICODE_WORD_BOUNDARY) != 0)
            sb.append((char)'w');
        if ((options & RegularExpression.EXTENDED_COMMENT) != 0)
            sb.append((char)'x');
        if ((options & RegularExpression.SPECIAL_COMMA) != 0)
            sb.append((char)',');
        return sb.toString().intern();
    }

    // ================================================================

    static String stripExtendedComment(String regex) {
        int len = regex.length();
        StringBuffer buffer = new StringBuffer(len);
        int offset = 0;
        int charClass = 0;
        while (offset < len) {
            int ch = regex.charAt(offset++);
                                                // Skips a white space.
            if (ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r' || ch == ' ') {
                // if we are inside a character class, we keep the white space
                if (charClass > 0) {
                    buffer.append((char)ch);
                }
                continue;
            }

            if (ch == '#') {                    // Skips chracters between '#' and a line end.
                while (offset < len) {
                    ch = regex.charAt(offset++);
                    if (ch == '\r' || ch == '\n')
                        break;
                }
                continue;
            }

            int next;                           // Strips an escaped white space.
            if (ch == '\\' && offset < len) {
                if ((next = regex.charAt(offset)) == '#'
                    || next == '\t' || next == '\n' || next == '\f'
                    || next == '\r' || next == ' ') {
                    buffer.append((char)next);
                    offset ++;
                } else {                        // Other escaped character.
                    buffer.append((char)'\\');
                    buffer.append((char)next);
                    offset ++;
                }
            }
            else if (ch == '[') {
                charClass++;
                buffer.append((char)ch);
                if (offset < len) {
                    next = regex.charAt(offset);
                    if (next == '[' || next ==']') {
                        buffer.append((char)next);
                        offset ++;
                    }
                    else if (next == '^' && offset + 1 < len) {
                        next = regex.charAt(offset + 1);
                        if (next == '[' || next ==']') {
                            buffer.append((char)'^');
                            buffer.append((char)next);
                            offset += 2;
                        }
                    }
                }
            }
            else {
                if (charClass > 0 && ch == ']') {
                    --charClass;
                }
                buffer.append((char)ch);
            }
        }
        return buffer.toString();
    }

    // ================================================================

    /**
     * Sample entry.
     * 
Usage: org.apache.xerces.utils.regex.REUtil <regex> <string>
*/ public static void main(String[] argv) { String pattern = null; try { String options = ""; String target = null; if( argv.length == 0 ) { System.out.println( "Error:Usage: java REUtil -i|-m|-s|-u|-w|-X regularExpression String" ); System.exit( 0 ); } for (int i = 0; i < argv.length; i ++) { if (argv[i].length() == 0 || argv[i].charAt(0) != '-') { if (pattern == null) pattern = argv[i]; else if (target == null) target = argv[i]; else System.err.println("Unnecessary: "+argv[i]); } else if (argv[i].equals("-i")) { options += "i"; } else if (argv[i].equals("-m")) { options += "m"; } else if (argv[i].equals("-s")) { options += "s"; } else if (argv[i].equals("-u")) { options += "u"; } else if (argv[i].equals("-w")) { options += "w"; } else if (argv[i].equals("-X")) { options += "X"; } else { System.err.println("Unknown option: "+argv[i]); } } RegularExpression reg = new RegularExpression(pattern, options); System.out.println("RegularExpression: "+reg); Match match = new Match(); reg.matches(target, match); for (int i = 0; i < match.getNumberOfGroups(); i ++) { if (i == 0 ) System.out.print("Matched range for the whole pattern: "); else System.out.print("["+i+"]: "); if (match.getBeginning(i) < 0) System.out.println("-1"); else { System.out.print(match.getBeginning(i)+", "+match.getEnd(i)+", "); System.out.println("\""+match.getCapturedText(i)+"\""); } } } catch (ParseException pe) { if (pattern == null) { pe.printStackTrace(); } else { System.err.println("org.apache.xerces.utils.regex.ParseException: "+pe.getMessage()); String indent = " "; System.err.println(indent+pattern); int loc = pe.getLocation(); if (loc >= 0) { System.err.print(indent); for (int i = 0; i < loc; i ++) System.err.print("-"); System.err.println("^"); } } } catch (Exception e) { e.printStackTrace(); } } static final int CACHESIZE = 20; static final RegularExpression[] regexCache = new RegularExpression[CACHESIZE]; /** * Creates a RegularExpression instance. * This method caches created instances. * * @see RegularExpression#RegularExpression(java.lang.String, java.lang.String) */ public static RegularExpression createRegex(String pattern, String options) throws ParseException { RegularExpression re = null; int intOptions = REUtil.parseOptions(options); synchronized (REUtil.regexCache) { int i; for (i = 0; i < REUtil.CACHESIZE; i ++) { RegularExpression cached = REUtil.regexCache[i]; if (cached == null) { i = -1; break; } if (cached.equals(pattern, intOptions)) { re = cached; break; } } if (re != null) { if (i != 0) { System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, i); REUtil.regexCache[0] = re; } } else { re = new RegularExpression(pattern, options); System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, REUtil.CACHESIZE-1); REUtil.regexCache[0] = re; } } return re; } /** * * @see RegularExpression#matches(java.lang.String) */ public static boolean matches(String regex, String target) throws ParseException { return REUtil.createRegex(regex, null).matches(target); } /** * * @see RegularExpression#matches(java.lang.String) */ public static boolean matches(String regex, String options, String target) throws ParseException { return REUtil.createRegex(regex, options).matches(target); } // ================================================================ /** * */ public static String quoteMeta(String literal) { int len = literal.length(); StringBuffer buffer = null; for (int i = 0; i < len; i ++) { int ch = literal.charAt(i); if (".*+?{[()|\\^$".indexOf(ch) >= 0) { if (buffer == null) { buffer = new StringBuffer(i+(len-i)*2); if (i > 0) buffer.append(literal.substring(0, i)); } buffer.append((char)'\\'); buffer.append((char)ch); } else if (buffer != null) buffer.append((char)ch); } return buffer != null ? buffer.toString() : literal; } // ================================================================ static void dumpString(String v) { for (int i = 0; i < v.length(); i ++) { System.out.print(Integer.toHexString(v.charAt(i))); System.out.print(" "); } System.out.println(); } static void setupRange(Token range, String src) { int len = src.length(); for (int i = 0; i < len; i += 2) range.addRange(src.charAt(i), src.charAt(i+1)); } static void setupRange(Token range, int[] src) { int len = src.length; for (int i = 0; i < len; i += 2) range.addRange(src[i], src[i+1]); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy