All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.caucho.quercus.lib.regexp.RegexpSet Maven / Gradle / Ivy

There is a newer version: 4.0.66
Show newest version
/*
 * Copyright (c) 1998-2012 Caucho Technology -- all rights reserved
 *
 * This file is part of Resin(R) Open Source
 *
 * Each copy or derived work must preserve the copyright notice and this
 * notice unmodified.
 *
 * Resin Open Source is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Resin Open Source is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
 * of NON-INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Resin Open Source; if not, write to the
 *
 *   Free Software Foundation, Inc.
 *   59 Temple Place, Suite 330
 *   Boston, MA 02111-1307  USA
 *
 * @author Scott Ferguson
 */

package com.caucho.quercus.lib.regexp;

import java.util.HashMap;

import com.caucho.util.*;

// XXX: non-ascii range not quite correct for unicode, and neither is
// PHP's /u unicode option
/**
 * A mutable set of character codes used while building regular expressions.
 *
 * <p>Codes below {@link #BITSET_CHARS} are stored in a boolean table
 * ({@code _bitset}); all other codes are stored in an {@link IntSet}
 * ({@code _range}).
 *
 * <p>The predefined static sets ({@code SPACE}, {@code WORD}, the POSIX
 * classes, ...) are shared, mutable instances.  The mutating methods
 * ({@code mergeOr}, {@code mergeOverlap}, {@code difference}, ...) change the
 * receiver in place, so use the copy constructor before modifying one of the
 * shared sets.
 */
class RegexpSet {
  /** Number of low character codes tracked by the boolean table. */
  static final int BITSET_CHARS = 128;

  static RegexpSet SPACE = null;
  static RegexpSet WORD = null;
  static RegexpSet DIGIT = null;
  // NOTE(review): DOT contains only '\n'; presumably callers use it through
  // createNotNode() so that '.' matches everything else -- confirm at call site.
  static RegexpSet DOT = null;

  // POSIX character classes
  static RegexpSet PALNUM = null;
  static RegexpSet PALPHA = null;
  static RegexpSet PASCII = null;
  static RegexpSet PBLANK = null;
  static RegexpSet PCNTRL = null;
  static RegexpSet PDIGIT = null;
  static RegexpSet PGRAPH = null;
  static RegexpSet PLOWER = null;
  static RegexpSet PPRINT = null;
  static RegexpSet PPUNCT = null;
  static RegexpSet PSPACE = null;
  static RegexpSet PUPPER = null;
  static RegexpSet PXDIGIT = null;

  /** Maps a POSIX class name (e.g. "alnum", "xdigit") to its shared set. */
  static HashMap<String, RegexpSet> CLASS_MAP = null;

  // membership table for codes in [0, BITSET_CHARS)
  boolean []_bitset = new boolean[BITSET_CHARS];
  // membership for codes >= BITSET_CHARS
  IntSet _range;

  /**
   * Creates an empty set.
   */
  RegexpSet()
  {
    _range = new IntSet();
  }

  /**
   * Creates a copy of an existing set.  The boolean table is copied and the
   * range set is cloned, so the new set does not share its table array or
   * range object with {@code old}.
   *
   * @param old the set to copy
   */
  RegexpSet(RegexpSet old)
  {
    System.arraycopy(old._bitset, 0, _bitset, 0, _bitset.length);

    _range = (IntSet) old._range.clone();
  }

  /**
   * Ors two character sets: after the call this set also contains every
   * character of {@code b}.
   *
   * @param b the set to merge into this one (not modified here)
   */
  void mergeOr(RegexpSet b)
  {
    for (int i = 0; i < BITSET_CHARS; i++)
      _bitset[i] = _bitset[i] || b._bitset[i];

    _range.union(b._range);
  }

  /**
   * Ors a set with the inverse of another.
   *
   * <p>For codes below {@link #BITSET_CHARS} every code that is not in
   * {@code b} is added.  For the remaining codes the work is delegated to
   * {@code IntSet.unionNegate} with the bounds 0 and 0xfffff.
   *
   * @param b the set whose inverse is merged into this one
   */
  void mergeOrInv(RegexpSet b)
  {
    for (int i = 0; i < BITSET_CHARS; i++)
      _bitset[i] = _bitset[i] || ! b._bitset[i];

    // NOTE(review): the upper bound is 0xfffff, not the Unicode maximum
    // 0x10ffff -- confirm whether that is intentional.
    _range.unionNegate(b._range, 0, 0xfffff);
  }

  /**
   * Set a range of characters in a character set.
   *
   * @param low first character code of the range, inclusive
   * @param high last character code of the range, inclusive
   * @throws RuntimeException if {@code low > high} or {@code low < 0}
   */
  void setRange(int low, int high)
  {
    // php/4es0
    // http://bugs.caucho.com/view.php?id=3811
    if (low > high || low < 0)
      throw new RuntimeException("Range out of range (" + low + ", " + high + ")");

    if (low < BITSET_CHARS) {
      // Clamp with "high" rather than "high + 1": the latter overflows when
      // high == Integer.MAX_VALUE, which would skip the low codes entirely.
      int lastTableCode = Math.min(high, BITSET_CHARS - 1);

      for (int i = low; i <= lastTableCode; i++)
        _bitset[i] = true;

      if (high < BITSET_CHARS)
        return;

      // the remainder of the range lives in the IntSet
      low = BITSET_CHARS;
    }

    _range.union(low, high);
  }

  /**
   * Calculate the intersection of two sets.  This set is replaced by the
   * intersection.
   *
   * @param next the set to intersect with (not modified here)
   * @return true if disjoint, i.e. no table entry is shared and
   *   {@code IntSet.intersection} did not return true
   */
  boolean mergeOverlap(RegexpSet next)
  {
    boolean isDisjoint = true;

    for (int i = 0; i < BITSET_CHARS; i++) {
      _bitset[i] = _bitset[i] && next._bitset[i];

      if (_bitset[i])
        isDisjoint = false;
    }

    // presumably intersection() returns true when the result is non-empty
    if (_range.intersection(next._range))
      isDisjoint = false;

    return isDisjoint;
  }

  /**
   * Calculate the difference of two sets: every character of {@code next}
   * is removed from this set.
   *
   * @param next the set to subtract (not modified here)
   */
  void difference(RegexpSet next)
  {
    for (int i = 0; i < BITSET_CHARS; i++) {
      _bitset[i] = _bitset[i] && ! next._bitset[i];
    }

    _range.difference(next._range);
  }

  /**
   * Returns true if the character is in the set.
   *
   * @param ch the character code to test; negative codes never match
   */
  boolean match(int ch)
  {
    if (ch < 0)
      return false;
    else if (ch < BITSET_CHARS)
      return _bitset[ch];
    else {
      return _range.contains(ch);
    }
  }

  /**
   * Creates a node matching the characters in this set.  When the range set
   * reports size 0 a table-only {@code AsciiSet} node is used.
   *
   * <p>The table array (and range set) are handed to the node as-is; this
   * method does not copy them.
   */
  RegexpNode createNode()
  {
    if (_range.size() == 0)
      return new RegexpNode.AsciiSet(_bitset);
    else
      return new RegexpNode.Set(_bitset, _range);
  }

  /**
   * Creates the negated counterpart of {@link #createNode()}, using
   * {@code AsciiNotSet} when the range set reports size 0 and
   * {@code NotSet} otherwise.
   */
  RegexpNode createNotNode()
  {
    if (_range.size() == 0)
      return new RegexpNode.AsciiNotSet(_bitset);
    else
      return new RegexpNode.NotSet(_bitset, _range);
  }

  /**
   * Returns the size reported by the range set.  Entries of the boolean
   * table (codes below {@link #BITSET_CHARS}) are not counted.
   */
  int getSize()
  {
    return _range.size();
  }

  // Builds the shared sets.  Order matters: PALPHA, PALNUM, PGRAPH and PPRINT
  // are composed from sets created earlier in this block.
  static {
    SPACE = new RegexpSet();
    SPACE.setRange(' ', ' ');
    SPACE.setRange(0x09, 0x0A); //tab to newline
    SPACE.setRange(0x0C, 0x0D); //form feed to carriage return

    DOT = new RegexpSet();
    DOT.setRange('\n', '\n');

    DIGIT = new RegexpSet();
    DIGIT.setRange('0', '9');

    WORD = new RegexpSet();
    WORD.setRange('a', 'z');
    WORD.setRange('A', 'Z');
    WORD.setRange('0', '9');
    WORD.setRange('_', '_');

    PASCII = new RegexpSet();
    PASCII.setRange(0, 0x7F);
    PASCII.setRange(0x81, 0x87);
    PASCII.setRange(0x89, 0x97);
    PASCII.setRange(0x9A, 0xFF);

    PBLANK = new RegexpSet();
    PBLANK.setRange(' ', ' ');
    PBLANK.setRange('\t', '\t');
    PBLANK.setRange(0xA0, 0xA0);

    PCNTRL = new RegexpSet();
    PCNTRL.setRange(0, 0x1F);
    PCNTRL.setRange(0x7F, 0x7F);
    PCNTRL.setRange(0x81, 0x81);
    PCNTRL.setRange(0x8D, 0x8D);
    PCNTRL.setRange(0x8F, 0x90);
    PCNTRL.setRange(0x9D, 0x9D);

    PDIGIT = new RegexpSet();
    PDIGIT.setRange('0', '9');
    PDIGIT.setRange(0xB2, 0xB3);
    PDIGIT.setRange(0xB9, 0xB9);

    PLOWER = new RegexpSet();
    PLOWER.setRange('a', 'z');
    PLOWER.setRange(0x83, 0x83);
    PLOWER.setRange(0x9A, 0x9A);
    PLOWER.setRange(0x9C, 0x9C);
    PLOWER.setRange(0x9E, 0x9E);
    PLOWER.setRange(0xAA, 0xAA);
    PLOWER.setRange(0xB5, 0xB5);
    PLOWER.setRange(0xBA, 0xBA);
    PLOWER.setRange(0xDF, 0xF6);
    PLOWER.setRange(0xF8, 0xFF);

    PSPACE = new RegexpSet();
    PSPACE.setRange(' ', ' ');
    PSPACE.setRange(0x09, 0x0D);
    PSPACE.setRange(0xA0, 0xA0);

    PUPPER = new RegexpSet();
    PUPPER.setRange('A', 'Z');
    PUPPER.setRange(0x8A, 0x8A);
    PUPPER.setRange(0x8C, 0x8C);
    PUPPER.setRange(0x8E, 0x8E);
    PUPPER.setRange(0x9F, 0x9F);
    PUPPER.setRange(0xC0, 0xD6);
    PUPPER.setRange(0xD8, 0xDE);

    PXDIGIT = new RegexpSet();
    PXDIGIT.setRange('0', '9');
    PXDIGIT.setRange('A', 'F');
    PXDIGIT.setRange('a', 'f');

    // alpha = lower | upper
    PALPHA = new RegexpSet();
    PALPHA.mergeOr(PLOWER);
    PALPHA.mergeOr(PUPPER);

    // alnum = alpha | digit
    PALNUM = new RegexpSet();
    PALNUM.mergeOr(PALPHA);
    PALNUM.mergeOr(PDIGIT);

    PPUNCT = new RegexpSet();
    PPUNCT.setRange(0x21, 0x2F);
    PPUNCT.setRange(0x3A, 0x40);
    PPUNCT.setRange(0x5B, 0x60);
    PPUNCT.setRange(0x7B, 0x7E);
    PPUNCT.setRange(0x82, 0x82);
    PPUNCT.setRange(0x84, 0x87);
    PPUNCT.setRange(0x89, 0x89);
    PPUNCT.setRange(0x8B, 0x8B);
    PPUNCT.setRange(0x91, 0x97);
    PPUNCT.setRange(0x9B, 0x9B);
    PPUNCT.setRange(0xA1, 0xBF);
    PPUNCT.setRange(0xD7, 0xD7);
    PPUNCT.setRange(0xF7, 0xF7);

    // graph = alnum | punct
    PGRAPH = new RegexpSet();
    PGRAPH.mergeOr(PALNUM);
    PGRAPH.mergeOr(PPUNCT);

    // print = graph | space | tab | 0xA0
    PPRINT = new RegexpSet();
    PPRINT.mergeOr(PGRAPH);
    PPRINT.setRange(' ', ' ');
    PPRINT.setRange(0x09, 0x09);
    PPRINT.setRange(0xA0, 0xA0);

    CLASS_MAP = new HashMap<String, RegexpSet>();
    CLASS_MAP.put("alnum", PALNUM); //php/4ek0
    CLASS_MAP.put("alpha", PALPHA); //php/4ek1
    CLASS_MAP.put("ascii", PASCII); //php/4ek2
    CLASS_MAP.put("blank", PBLANK); //php/4ek3
    CLASS_MAP.put("cntrl", PCNTRL); //php/4ek4
    CLASS_MAP.put("digit", PDIGIT); //php/4ek5
    CLASS_MAP.put("graph", PGRAPH); //php/4ek6
    CLASS_MAP.put("lower", PLOWER); //php/4ek7
    CLASS_MAP.put("print", PPRINT); //php/4ek8
    CLASS_MAP.put("punct", PPUNCT); //php/4ek9
    CLASS_MAP.put("space", PSPACE); //php/4eka
    CLASS_MAP.put("upper", PUPPER); //php/4ekb
    CLASS_MAP.put("xdigit", PXDIGIT); //php/4ekc
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy