/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.pattern;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

/**
 * This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton} to locate tokens.
 * The regexp syntax is more limited than that of {@link PatternTokenizer}, but the tokenization is quite a bit faster.  The provided
 * regex should match valid token characters (not token separator characters, like {@code String.split}).  The matching is greedy:
 * the longest match at a given start point will be the next token.  Empty string tokens are never produced.
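 *
 * <p>For example (an illustrative sketch, not exhaustive), matching runs of non-comma characters
 * tokenizes comma-separated input:
 *
 * <pre class="prettyprint">
 * Tokenizer tokenizer = new SimplePatternTokenizer("[^,]+");
 * tokenizer.setReader(new java.io.StringReader("a,bb,ccc"));  // yields the tokens "a", "bb", "ccc"
 * </pre>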
 *
 * @lucene.experimental
 */

// TODO: the matcher here is naive and does have N^2 adversarial cases that are unlikely to arise in practice, e.g. if the pattern is
// aaaaaaaaaab and the input is aaaaaaaaaaa, the work we do here is N^2 where N is the number of a's.  This is because on failing to match
// a token, we skip one character forward and try again.  A better approach would be to compile something like this regexp
// instead: .* | <pattern>, because that automaton would not "forget" all the a's it had already seen, and would be a single pass
// through the input.  I think this is the same thing as Aho/Corasick's algorithm (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm).
// But we cannot implement this (I think?) until/unless Lucene regexps support sub-group capture, so we could know
// which specific characters the pattern matched.  SynonymFilter has this same limitation.

public final class SimplePatternTokenizer extends Tokenizer {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final CharacterRunAutomaton runDFA;

  // TODO: we could likely use a single rolling buffer instead of two separate char buffers here.  We could also use PushBackReader but I
  // suspect it's slowish:

  private char[] pendingChars = new char[8];    // pushed-back chars, replayed before reading more input
  private int pendingLimit;                     // number of valid chars in pendingChars
  private int pendingUpto;                      // next char to replay from pendingChars
  private int offset;                           // absolute offset of the next char to be read
  private int tokenUpto;                        // number of chars in the current candidate token
  private final char[] buffer = new char[1024]; // read-ahead buffer over the input reader
  private int bufferLimit;                      // number of valid chars in buffer, or -1 once the reader is exhausted
  private int bufferNextRead;                   // next char to consume from buffer

  /** See {@link RegExp} for the accepted syntax. */
  public SimplePatternTokenizer(String regexp) {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
  }

  /** Runs a pre-built automaton. */
  public SimplePatternTokenizer(Automaton dfa) {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa);
  }

  /** See {@link RegExp} for the accepted syntax.  The {@code determinizeWorkLimit} bounds the
   *  effort spent determinizing the regexp's automaton. */
  public SimplePatternTokenizer(AttributeFactory factory, String regexp, int determinizeWorkLimit) {
    this(factory, new RegExp(regexp).toAutomaton(determinizeWorkLimit));
  }

  /** Runs a pre-built automaton. */
  public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
    super(factory);

    // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
    // realizing this ctor is otherwise trappy
    if (dfa.isDeterministic() == false) {
      throw new IllegalArgumentException("please determinize the incoming automaton first");
    }
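
    // For example (an illustrative sketch), a caller can determinize up front and then reuse the
    // automaton across many tokenizer instances:
    //
    //   Automaton a = new RegExp("[a-z]+").toAutomaton();
    //   a = Operations.determinize(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
    //   Tokenizer t = new SimplePatternTokenizer(a);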

    runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
  }

  @Override
  public boolean incrementToken() throws IOException {

    clearAttributes();
    tokenUpto = 0;

    while (true) {
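      // Each iteration greedily scans one candidate token from the current start point.  A worked
      // example (illustrative): with the pattern "ab*c" and the input "abbd", the scan reads
      // 'a', 'b', 'b', 'd' without ever reaching an accept state, then pushes back "bbd" and
      // retries from the 'b' that follows 'a'.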

      int offsetStart = offset;

      // The runDFA operates in Unicode space, not UTF16 (java's char):

      int ch = nextCodePoint();
      if (ch == -1) {
        return false;
      }

      int state = runDFA.step(0, ch);

      if (state != -1) {
        // a token just possibly started; keep scanning to see if the token is accepted:
        int lastAcceptLength = -1;
        do {

          if (runDFA.isAccept(state)) {
            // record that the token matches here, but keep scanning in case a longer match also works (greedy):
            lastAcceptLength = tokenUpto;
          }

          ch = nextCodePoint();
          if (ch == -1) {
            break;
          }
          state = runDFA.step(state, ch);
        } while (state != -1);
        
        if (lastAcceptLength != -1) {
          // we found a token
          int extra = tokenUpto - lastAcceptLength;
          if (extra != 0) {
            pushBack(extra);
          }
          termAtt.setLength(lastAcceptLength);
          offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+lastAcceptLength));
          return true;
        } else if (ch == -1) {
          return false;
        } else {
          // false alarm: there was no token here; push back all but the first character we scanned
          pushBack(tokenUpto-1);
          tokenUpto = 0;
        }
      } else {
        tokenUpto = 0;
      }
    }
  }

  @Override
  public void end() throws IOException {
    super.end();
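    // offset was decremented for pushed-back characters; add back any that were never replayed,
    // so the final offset reflects everything actually read from the input: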
    final int ofs = correctOffset(offset + pendingLimit - pendingUpto);
    offsetAtt.setOffset(ofs, ofs);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    offset = 0;
    pendingUpto = 0;
    pendingLimit = 0;
    tokenUpto = 0;
    bufferNextRead = 0;
    bufferLimit = 0;
  }

  /** Pushes back the last {@code count} characters in current token's buffer. */
  private void pushBack(int count) {
    
    if (pendingLimit == 0) {
      if (bufferLimit != -1 && bufferNextRead >= count) {
        // optimize common case when the chars we are pushing back are still in the buffer
        bufferNextRead -= count;
      } else {
        if (count > pendingChars.length) {
          pendingChars = ArrayUtil.grow(pendingChars, count);
        }
        System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
        pendingLimit = count;
      }
    } else {
      // we are pushing back what is already in our pending buffer
      pendingUpto -= count;
      assert pendingUpto >= 0;
    }
    offset -= count;
  }

  private void appendToToken(char ch) {
    char[] buffer = termAtt.buffer();
    if (tokenUpto == buffer.length) {
      buffer = termAtt.resizeBuffer(tokenUpto + 1);
    }
    buffer[tokenUpto++] = ch;
  }

  private int nextCodeUnit() throws IOException {
    int result;
    if (pendingUpto < pendingLimit) {
      result = pendingChars[pendingUpto++];
      if (pendingUpto == pendingLimit) {
        // We used up the pending buffer
        pendingUpto = 0;
        pendingLimit = 0;
      }
      appendToToken((char) result);
      offset++;
    } else if (bufferLimit == -1) {
      return -1;
    } else {
      assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit;
      if (bufferNextRead == bufferLimit) {
        bufferLimit = input.read(buffer, 0, buffer.length);
        if (bufferLimit == -1) {
          return -1;
        }
        bufferNextRead = 0;
      }
      result = buffer[bufferNextRead++];
      offset++;
      appendToToken((char) result);
    }
    return result;
  }
  
  private int nextCodePoint() throws IOException {

    int ch = nextCodeUnit();
    if (ch == -1) {
      return ch;
    }
    if (Character.isHighSurrogate((char) ch)) {
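      // combine the surrogate pair into a single code point; the input is assumed to be
      // well-formed UTF-16 (each high surrogate is followed by its low surrogate)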
      return Character.toCodePoint((char) ch, (char) nextCodeUnit());
    } else {
      return ch;
    }
  }
}
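
// An end-to-end usage sketch (illustrative, not part of the Lucene source): the demo class below
// exercises the full TokenStream contract (setReader, reset, incrementToken, end, close) and reads
// the term and offset attributes produced by SimplePatternTokenizer.  The pattern and input are
// hypothetical examples.
class SimplePatternTokenizerDemo {
  public static void main(String[] args) throws IOException {
    try (Tokenizer tokenizer = new SimplePatternTokenizer("[a-z]+")) {
      tokenizer.setReader(new java.io.StringReader("foo bar, baz"));
      CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
      OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        // prints: foo [0,3) / bar [4,7) / baz [9,12)
        System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
      }
      tokenizer.end();
    }
  }
}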