All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.oro.text.awk.AwkMatcher Maven / Gradle / Ivy

/*
 * $Id: AwkMatcher.java,v 1.11 2003/11/07 20:16:24 dfs Exp $
 *
 * ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2000 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
 *    must not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact [email protected].
 *
 * 5. Products derived from this software may not be called "Apache" 
 *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
 *    name, without prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * .
 */


package org.apache.oro.text.awk;

import java.io.*;

import org.apache.oro.text.regex.*;

/**
 * The AwkMatcher class is used to match regular expressions
 * (conforming to the Awk regular expression syntax) generated by
 * AwkCompiler.  AwkMatcher only supports 8-bit ASCII.  Any attempt
 * to match Unicode values greater than 255 will result in undefined
 * behavior.  AwkMatcher finds true leftmost-longest matches, so
 * you must take care with how you formulate your regular expression
 * to avoid matching more than you really want.
 * 

* It is important for you to remember that AwkMatcher does not save * parenthesized sub-group information. Therefore the number of groups * saved in a MatchResult produced by AwkMatcher will always be 1. * * @version @version@ * @since 1.0 * @see org.apache.oro.text.regex.PatternMatcher * @see AwkCompiler */ public final class AwkMatcher implements PatternMatcher { private int __lastMatchedBufferOffset; private AwkMatchResult __lastMatchResult = null; private AwkStreamInput __scratchBuffer, __streamSearchBuffer; private AwkPattern __awkPattern; private int __offsets[] = new int[2]; /** * A kluge variable to make PatternMatcherInput matches work when * their begin offset is non-zero. This kluge is caused by the * misguided notion that AwkStreamInput could be overloaded to do * both stream and fixed buffer matches. The whole input representation * scheme has to be scrapped and redone. -- dfs 2001/07/10 */ private int __beginOffset; public AwkMatcher() { __scratchBuffer = new AwkStreamInput(); __scratchBuffer._endOfStreamReached = true; } /** * Determines if a prefix of a string (represented as a char[]) * matches a given pattern, starting from a given offset into the string. * If a prefix of the string matches the pattern, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input The char[] to test for a prefix match. * @param pattern The Pattern to be matched. * @param offset The offset at which to start searching for the prefix. * @return True if input matches pattern, false otherwise. */ // I reimplemented this method in terms of streammatchesPrefix // to reduce the code size. This is not very elegant and // reduces performance by a small degree. public boolean matchesPrefix(char[] input, Pattern pattern, int offset){ int result = -1; __awkPattern = (AwkPattern)pattern; __scratchBuffer._buffer = input; __scratchBuffer._bufferSize = input.length; __scratchBuffer._bufferOffset = __beginOffset = 0; __scratchBuffer._endOfStreamReached = true; __streamSearchBuffer = __scratchBuffer; __offsets[0] = offset; try { result = __streamMatchPrefix(); } catch(IOException e){ // Don't do anything because we're not doing any I/O result = -1; } if(result < 0) { __lastMatchResult = null; return false; } __lastMatchResult = new AwkMatchResult(new String(input, 0, result), offset); return true; } /** * Determines if a prefix of a string (represented as a char[]) * matches a given pattern. * If a prefix of the string matches the pattern, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input The char[] to test for a prefix match. * @param pattern The Pattern to be matched. * @return True if input matches pattern, false otherwise. */ public boolean matchesPrefix(char[] input, Pattern pattern){ return matchesPrefix(input, pattern, 0); } /** * Determines if a prefix of a string matches a given pattern. * If a prefix of the string matches the pattern, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input The String to test for a prefix match. * @param pattern The Pattern to be matched. * @return True if input matches pattern, false otherwise. */ public boolean matchesPrefix(String input, Pattern pattern) { return matchesPrefix(input.toCharArray(), pattern, 0); } /** * Determines if a prefix of a PatternMatcherInput instance * matches a given pattern. If there is a match, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. Unlike the * {@link #contains(PatternMatcherInput, Pattern)} * method, the current offset of the PatternMatcherInput argument * is not updated. You should remember that the region starting * from the begin offset of the PatternMatcherInput will be * tested for a prefix match. *

* This method is useful for certain common token identification tasks * that are made more difficult without this functionality. *

* @param input The PatternMatcherInput to test for a prefix match. * @param pattern The Pattern to be matched. * @return True if input matches pattern, false otherwise. */ public boolean matchesPrefix(PatternMatcherInput input, Pattern pattern){ int result = -1; __awkPattern = (AwkPattern)pattern; __scratchBuffer._buffer = input.getBuffer(); __scratchBuffer._bufferOffset = __beginOffset = input.getBeginOffset(); __offsets[0] = input.getCurrentOffset(); __scratchBuffer._bufferSize = input.length(); __scratchBuffer._endOfStreamReached = true; __streamSearchBuffer = __scratchBuffer; try { result = __streamMatchPrefix(); } catch(IOException e) { // Don't do anything because we're not doing any I/O result = -1; } if(result < 0) { __lastMatchResult = null; return false; } __lastMatchResult = new AwkMatchResult(new String(__scratchBuffer._buffer, __offsets[0], result), __offsets[0]); return true; } /** * Determines if a string (represented as a char[]) exactly * matches a given pattern. If * there is an exact match, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. The pattern must be * an AwkPattern instance, otherwise a ClassCastException will * be thrown. You are not required to, and indeed should NOT try to * (for performance reasons), catch a ClassCastException because it * will never be thrown as long as you use an AwkPattern as the pattern * parameter. *

* @param input The char[] to test for an exact match. * @param pattern The AwkPattern to be matched. * @return True if input matches pattern, false otherwise. * @exception ClassCastException If a Pattern instance other than an * AwkPattern is passed as the pattern parameter. */ public boolean matches(char[] input, Pattern pattern) { int result = -1; __awkPattern = (AwkPattern)pattern; __scratchBuffer._buffer = input; __scratchBuffer._bufferSize = input.length; __scratchBuffer._bufferOffset = __beginOffset = 0; __scratchBuffer._endOfStreamReached = true; __streamSearchBuffer = __scratchBuffer; __offsets[0] = 0; try { result = __streamMatchPrefix(); } catch(IOException e){ // Don't do anything because we're not doing any I/O result = -1; } if(result != input.length) { __lastMatchResult = null; return false; } __lastMatchResult = new AwkMatchResult(new String(input, 0, result), 0); return true; } /** * Determines if a string exactly matches a given pattern. If * there is an exact match, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. The pattern must be * a AwkPattern instance, otherwise a ClassCastException will * be thrown. You are not required to, and indeed should NOT try to * (for performance reasons), catch a ClassCastException because it * will never be thrown as long as you use an AwkPattern as the pattern * parameter. *

* @param input The String to test for an exact match. * @param pattern The AwkPattern to be matched. * @return True if input matches pattern, false otherwise. * @exception ClassCastException If a Pattern instance other than an * AwkPattern is passed as the pattern parameter. */ public boolean matches(String input, Pattern pattern){ return matches(input.toCharArray(), pattern); } /** * Determines if the contents of a PatternMatcherInput instance * exactly matches a given pattern. If * there is an exact match, a MatchResult instance * representing the match is made accesible via * {@link #getMatch()}. Unlike the * {@link #contains(PatternMatcherInput, Pattern)} * method, the current offset of the PatternMatcherInput argument * is not updated. You should remember that the region between * the begin and end offsets of the PatternMatcherInput will be * tested for an exact match. *

* The pattern must be an AwkPattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * an AwkPattern as the pattern parameter. *

* @param input The PatternMatcherInput to test for a match. * @param pattern The AwkPattern to be matched. * @return True if input matches pattern, false otherwise. * @exception ClassCastException If a Pattern instance other than an * AwkPattern is passed as the pattern parameter. */ public boolean matches(PatternMatcherInput input, Pattern pattern){ int result = -1; __awkPattern = (AwkPattern)pattern; __scratchBuffer._buffer = input.getBuffer(); __scratchBuffer._bufferSize = input.length(); __scratchBuffer._bufferOffset = __beginOffset = input.getBeginOffset(); __offsets[0] = input.getBeginOffset(); __scratchBuffer._endOfStreamReached = true; __streamSearchBuffer = __scratchBuffer; try { result = __streamMatchPrefix(); } catch(IOException e){ // Don't do anything because we're not doing any I/O result = -1; } if(result != __scratchBuffer._bufferSize) { __lastMatchResult = null; return false; } __lastMatchResult = new AwkMatchResult(new String(__scratchBuffer._buffer, __offsets[0], __scratchBuffer._bufferSize), __offsets[0]); return true; } /** * Determines if a string (represented as a char[]) contains a pattern. * If the pattern is * matched by some substring of the input, a MatchResult instance * representing the first such match is made acessible via * {@link #getMatch()}. If you want to access * subsequent matches you should either use a PatternMatcherInput object * or use the offset information in the MatchResult to create a substring * representing the remaining input. Using the MatchResult offset * information is the recommended method of obtaining the parts of the * string preceeding the match and following the match. *

* The pattern must be an AwkPattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * an AwkPattern as the pattern parameter. *

* @param input The char[] to test for a match. * @param pattern The AwkPattern to be matched. * @return True if the input contains a pattern match, false otherwise. * @exception ClassCastException If a Pattern instance other than an * AwkPattern is passed as the pattern parameter. */ public boolean contains(char[] input, Pattern pattern) { __awkPattern = (AwkPattern)pattern; // Begin anchor requires match occur at beginning of input if(__awkPattern._hasBeginAnchor && !__awkPattern._fastMap[input[0]]){ __lastMatchResult = null; return false; } __scratchBuffer._buffer = input; __scratchBuffer._bufferSize = input.length; __scratchBuffer._bufferOffset = __beginOffset = 0; __scratchBuffer._endOfStreamReached = true; __streamSearchBuffer = __scratchBuffer; __lastMatchedBufferOffset = 0; try { _search(); } catch(IOException e) { // do nothing } return (__lastMatchResult != null); } /** * Determines if a string contains a pattern. If the pattern is * matched by some substring of the input, a MatchResult instance * representing the first such match is made acessible via * {@link #getMatch()}. If you want to access * subsequent matches you should either use a PatternMatcherInput object * or use the offset information in the MatchResult to create a substring * representing the remaining input. Using the MatchResult offset * information is the recommended method of obtaining the parts of the * string preceeding the match and following the match. *

* The pattern must be an AwkPattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * an AwkPattern as the pattern parameter. *

* @param input The String to test for a match. * @param pattern The AwkPattern to be matched. * @return True if the input contains a pattern match, false otherwise. * @exception ClassCastException If a Pattern instance other than an * AwkPattern is passed as the pattern parameter. */ public boolean contains(String input, Pattern pattern){ return contains(input.toCharArray(), pattern); } /** * Determines if the contents of a PatternMatcherInput, starting from the * current offset of the input contains a pattern. * If a pattern match is found, a MatchResult * instance representing the first such match is made acessible via * {@link #getMatch()}. The current offset of the * PatternMatcherInput is set to the offset corresponding to the end * of the match, so that a subsequent call to this method will continue * searching where the last call left off. You should remember that the * region between the begin and end offsets of the PatternMatcherInput are * considered the input to be searched, and that the current offset * of the PatternMatcherInput reflects where a search will start from. * Matches extending beyond the end offset of the PatternMatcherInput * will not be matched. In other words, a match must occur entirely * between the begin and end offsets of the input. See * {@link org.apache.oro.text.regex.PatternMatcherInput PatternMatcherInput} * for more details. *

* As a side effect, if a match is found, the PatternMatcherInput match * offset information is updated. See the PatternMatcherInput * {@link org.apache.oro.text.regex.PatternMatcherInput#setMatchOffsets * setMatchOffsets(int, int)} method for more details. *

* The pattern must be an AwkPattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * an AwkPattern as the pattern parameter. *

* This method is usually used in a loop as follows: *

   * PatternMatcher matcher;
   * PatternCompiler compiler;
   * Pattern pattern;
   * PatternMatcherInput input;
   * MatchResult result;
   *
   * compiler = new AwkCompiler();
   * matcher  = new AwkMatcher();
   *
   * try {
   *   pattern = compiler.compile(somePatternString);
   * } catch(MalformedPatternException e) {
   *   System.err.println("Bad pattern.");
   *   System.err.println(e.getMessage());
   *   return;
   * }
   *
   * input   = new PatternMatcherInput(someStringInput);
   *
   * while(matcher.contains(input, pattern)) {
   *   result = matcher.getMatch();  
   *   // Perform whatever processing on the result you want.
   * }
   *
   * 
*

* @param input The PatternMatcherInput to test for a match. * @param pattern The Pattern to be matched. * @return True if the input contains a pattern match, false otherwise. * @exception ClassCastException If a Pattern instance other than an * AwkPattern is passed as the pattern parameter. */ public boolean contains(PatternMatcherInput input, Pattern pattern) { __awkPattern = (AwkPattern)pattern; __scratchBuffer._buffer = input.getBuffer(); __scratchBuffer._bufferOffset = __beginOffset = input.getBeginOffset(); __lastMatchedBufferOffset = input.getCurrentOffset(); // Begin anchor requires match occur at beginning of input // No need to adjust current offset if no match found. if(__awkPattern._hasBeginAnchor) { if(__beginOffset != __lastMatchedBufferOffset || !__awkPattern._fastMap[__scratchBuffer._buffer[__beginOffset]]) { __lastMatchResult = null; return false; } } __scratchBuffer._bufferSize = input.length(); __scratchBuffer._endOfStreamReached = true; __streamSearchBuffer = __scratchBuffer; try { _search(); } catch(IOException e) { // do nothing } input.setCurrentOffset(__lastMatchedBufferOffset); if(__lastMatchResult == null) return false; input.setMatchOffsets(__lastMatchResult.beginOffset(0), __lastMatchResult.endOffset(0)); return true; } /** * Determines if the contents of an AwkStreamInput, starting from the * current offset of the input contains a pattern. * If a pattern match is found, a MatchResult * instance representing the first such match is made acessible via * {@link #getMatch()}. The current offset of the * input stream is advanced to the end offset corresponding to the end * of the match. Consequently a subsequent call to this method will continue * searching where the last call left off. * See {@link AwkStreamInput} for more details. *

* Note, patterns matching the null string do NOT match at end of input * stream. This is different from the behavior you get from the other * contains() methods. *

* The pattern must be an AwkPattern instance, otherwise a * ClassCastException will be thrown. You are not required to, and * indeed should NOT try to (for performance reasons), catch a * ClassCastException because it will never be thrown as long as you use * an AwkPattern as the pattern parameter. *

* This method is usually used in a loop as follows: *

   * PatternMatcher matcher;
   * PatternCompiler compiler;
   * Pattern pattern;
   * AwkStreamInput input;
   * MatchResult result;
   *
   * compiler = new AwkCompiler();
   * matcher  = new AwkMatcher();
   *
   * try {
   *   pattern = compiler.compile(somePatternString);
   * } catch(MalformedPatternException e) {
   *   System.err.println("Bad pattern.");
   *   System.err.println(e.getMessage());
   *   return;
   * }
   *
   * input   = new AwkStreamInput(
   *             new BufferedInputStream(new FileInputStream(someFileName)));
   *
   * while(matcher.contains(input, pattern)) {
   *   result = matcher.getMatch();  
   *   // Perform whatever processing on the result you want.
   * }
   *
   * 
*

* @param input The PatternStreamInput to test for a match. * @param pattern The Pattern to be matched. * @return True if the input contains a pattern match, false otherwise. * @exception ClassCastException If a Pattern instance other than an * AwkPattern is passed as the pattern parameter. */ public boolean contains(AwkStreamInput input, Pattern pattern) throws IOException { __awkPattern = (AwkPattern)pattern; // Begin anchor requires match occur at beginning of input if(__awkPattern._hasBeginAnchor) { // Do read here instead of in _search() so we can test first char if(input._bufferOffset == 0) { if(input.read() && !__awkPattern._fastMap[input._buffer[0]]) { __lastMatchResult = null; return false; } } else { __lastMatchResult = null; return false; } } __lastMatchedBufferOffset = input._currentOffset; __streamSearchBuffer = input; __beginOffset = 0; _search(); input._currentOffset = __lastMatchedBufferOffset; if(__lastMatchResult != null) { // Adjust match begin offset to be relative to beginning of stream. __lastMatchResult._incrementMatchBeginOffset(input._bufferOffset); return true; } return false; } private int __streamMatchPrefix() throws IOException { int token, current = AwkPattern._START_STATE, lastState; int offset, initialOffset, maxOffset; int lastMatchedOffset = -1; int[] tstateArray; offset = initialOffset = __offsets[0]; maxOffset = __streamSearchBuffer._bufferSize + __beginOffset; test: while(offset < maxOffset) { token = __streamSearchBuffer._buffer[offset++]; if(current < __awkPattern._numStates) { lastState = current; tstateArray = __awkPattern._getStateArray(current); current = tstateArray[token]; if(current == 0){ __awkPattern._createNewState(lastState, token, tstateArray); current = tstateArray[token]; } if(current == AwkPattern._INVALID_STATE){ break test; } else if(__awkPattern._endStates.get(current)){ lastMatchedOffset = offset; } if(offset == maxOffset){ offset = __streamSearchBuffer._reallocate(initialOffset) + __beginOffset; maxOffset = __streamSearchBuffer._bufferSize + __beginOffset; // If we're at the end of the stream, don't reset values if(offset != maxOffset){ if(lastMatchedOffset != -1) lastMatchedOffset-=initialOffset; initialOffset = 0; } } } else break; } __offsets[0] = initialOffset; __offsets[1] = lastMatchedOffset - 1; if(lastMatchedOffset == -1 && __awkPattern._matchesNullString) return 0; // End anchor requires match occur at end of input if(__awkPattern._hasEndAnchor && (!__streamSearchBuffer._endOfStreamReached || lastMatchedOffset < __streamSearchBuffer._bufferSize + __beginOffset)) return -1; return (lastMatchedOffset - initialOffset); } void _search() throws IOException { int position, tokensMatched; __lastMatchResult = null; while(true){ if(__lastMatchedBufferOffset >= __streamSearchBuffer._bufferSize + __beginOffset) { if(__streamSearchBuffer._endOfStreamReached){ // Get rid of reference now that it should no longer be used. __streamSearchBuffer = null; return; } else { if(!__streamSearchBuffer.read()) return; __lastMatchedBufferOffset = 0; } } for(position = __lastMatchedBufferOffset; position < __streamSearchBuffer._bufferSize + __beginOffset; position = __offsets[0] + 1) { __offsets[0] = position; if(__awkPattern._fastMap[__streamSearchBuffer._buffer[position]] && (tokensMatched = __streamMatchPrefix()) > -1) { __lastMatchResult = new AwkMatchResult( new String(__streamSearchBuffer._buffer, __offsets[0], tokensMatched), __offsets[0]); __lastMatchedBufferOffset = (tokensMatched > 0 ? __offsets[1] + 1 : __offsets[0] + 1); return; } else if(__awkPattern._matchesNullString) { __lastMatchResult = new AwkMatchResult(new String(), position); __lastMatchedBufferOffset = position + 1; return; } } __lastMatchedBufferOffset = position; } } /** * Fetches the last match found by a call to a matches() or contains() * method. *

* @return A MatchResult instance containing the pattern match found * by the last call to any one of the matches() or contains() * methods. If no match was found by the last call, returns * null. */ public MatchResult getMatch() { return __lastMatchResult; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy