All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.tests.analysis.MockTokenizer Maven / Gradle / Ivy

There is a newer version: 7.6.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.tests.analysis;

import com.carrotsearch.randomizedtesting.RandomizedContext;
import java.io.IOException;
import java.nio.CharBuffer;
import java.util.Random;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

/**
 * Tokenizer for testing.
 *
 * 

This tokenizer is a replacement for {@link #WHITESPACE}, {@link #SIMPLE}, and {@link #KEYWORD} * tokenizers. If you are writing a component such as a TokenFilter, it's a great idea to test it * wrapping this tokenizer instead for extra checks. This tokenizer has the following behavior: * *

    *
  • An internal state-machine is used for checking consumer consistency. These checks can be * disabled with {@link #setEnableChecks(boolean)}. *
  • For convenience, optionally lowercases terms that it outputs. *
*/ public class MockTokenizer extends Tokenizer { /** Acts Similar to WhitespaceTokenizer */ public static final CharacterRunAutomaton WHITESPACE = new CharacterRunAutomaton( Operations.determinize( new RegExp("[^ \t\r\n]+").toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT)); /** * Acts Similar to KeywordTokenizer. TODO: Keyword returns an "empty" token for an empty reader... */ public static final CharacterRunAutomaton KEYWORD = new CharacterRunAutomaton( Operations.determinize( new RegExp(".*").toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT)); /** Acts like LetterTokenizer. */ // the ugly regex below is incomplete Unicode 5.2 [:Letter:] public static final CharacterRunAutomaton SIMPLE = new CharacterRunAutomaton( Operations.determinize( new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT)); /** * Limit the default token length to a size that doesn't cause random analyzer failures on * unpredictable data like the enwiki data set. * *

This value defaults to {@code CharTokenizer.DEFAULT_MAX_WORD_LEN} (255). * * @see "https://issues.apache.org/jira/browse/LUCENE-10541" */ public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; private final CharacterRunAutomaton runAutomaton; private final boolean lowerCase; private final int maxTokenLength; private int state; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); int off = 0; // buffered state (previous codepoint and offset). we replay this once we // hit a reject state in case it's permissible as the start of a new term. int bufferedCodePoint = -1; // -1 indicates empty buffer int bufferedOff = -1; // TODO: "register" with LuceneTestCase to ensure all streams are closed() ? // currently, we can only check that the lifecycle is correct if someone is reusing, // but not for "one-offs". private enum State { SETREADER, // consumer set a reader input either via ctor or via reset(Reader) RESET, // consumer has called reset() INCREMENT, // consumer is consuming, has called incrementToken() == true INCREMENT_FALSE, // consumer has called incrementToken() which returned false END, // consumer has called end() to perform end of stream operations CLOSE // consumer has called close() to release any resources }; private State streamState = State.CLOSE; private int lastOffset = 0; // only for checks private boolean enableChecks = true; // evil: but we don't change the behavior with this random, we only switch up how we read private final Random random = new Random(RandomizedContext.current().getRandom().nextLong()); public MockTokenizer( AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) { super(factory); this.runAutomaton = runAutomaton; this.lowerCase = lowerCase; this.state = 0; this.maxTokenLength = maxTokenLength; } public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) { this(BaseTokenStreamTestCase.newAttributeFactory(), runAutomaton, lowerCase, maxTokenLength); } public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) { this(runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH); } /** * Calls {@link #MockTokenizer(CharacterRunAutomaton, boolean) MockTokenizer(Reader, WHITESPACE, * true)} */ public MockTokenizer() { this(WHITESPACE, true); } public MockTokenizer( AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase) { this(factory, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH); } /** * Calls {@link #MockTokenizer(AttributeFactory,CharacterRunAutomaton,boolean) * MockTokenizer(AttributeFactory, Reader, WHITESPACE, true)} */ public MockTokenizer(AttributeFactory factory) { this(factory, WHITESPACE, true); } // we allow some checks (e.g. state machine) to be turned off. // turning off checks just means we suppress exceptions from them private void fail(String message) { if (enableChecks) { throw new IllegalStateException(message); } } private void failAlways(String message) { throw new IllegalStateException(message); } @Override public final boolean incrementToken() throws IOException { if (streamState != State.RESET && streamState != State.INCREMENT) { fail("incrementToken() called while in wrong state: " + streamState); } clearAttributes(); for (; ; ) { int startOffset; int cp; if (bufferedCodePoint >= 0) { cp = bufferedCodePoint; startOffset = bufferedOff; bufferedCodePoint = -1; } else { startOffset = off; cp = readCodePoint(); } if (cp < 0) { break; } else if (isTokenChar(cp)) { char[] chars = new char[2]; int endOffset; do { int len = Character.toChars(normalize(cp), chars, 0); for (int i = 0; i < len; i++) { termAtt.append(chars[i]); } endOffset = off; if (termAtt.length() >= maxTokenLength) { break; } cp = readCodePoint(); } while (cp >= 0 && isTokenChar(cp)); if (termAtt.length() < maxTokenLength) { // buffer up, in case the "rejected" char can start a new word of its own bufferedCodePoint = cp; bufferedOff = endOffset; } else { // otherwise, it's because we hit term limit. bufferedCodePoint = -1; } int correctedStartOffset = correctOffset(startOffset); int correctedEndOffset = correctOffset(endOffset); if (correctedStartOffset < 0) { failAlways( "invalid start offset: " + correctedStartOffset + ", before correction: " + startOffset); } if (correctedEndOffset < 0) { failAlways( "invalid end offset: " + correctedEndOffset + ", before correction: " + endOffset); } if (correctedStartOffset < lastOffset) { failAlways( "start offset went backwards: " + correctedStartOffset + ", before correction: " + startOffset + ", lastOffset: " + lastOffset); } lastOffset = correctedStartOffset; if (correctedEndOffset < correctedStartOffset) { failAlways( "end offset: " + correctedEndOffset + " is before start offset: " + correctedStartOffset); } offsetAtt.setOffset(correctedStartOffset, correctedEndOffset); if (state == -1 || runAutomaton.isAccept(state)) { // either we hit a reject state (longest match), or end-of-text, but in an accept state streamState = State.INCREMENT; return true; } } } streamState = State.INCREMENT_FALSE; return false; } protected int readCodePoint() throws IOException { int ch = readChar(); if (ch < 0) { return ch; } else { if (Character.isLowSurrogate((char) ch)) { failAlways("unpaired low surrogate: " + Integer.toHexString(ch)); } off++; if (Character.isHighSurrogate((char) ch)) { int ch2 = readChar(); if (ch2 >= 0) { off++; if (!Character.isLowSurrogate((char) ch2)) { failAlways( "unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2)); } return Character.toCodePoint((char) ch, (char) ch2); } else { failAlways("stream ends with unpaired high surrogate: " + Integer.toHexString(ch)); } } return ch; } } protected int readChar() throws IOException { switch (random.nextInt(10)) { case 0: { // read(char[]) char[] c = new char[1]; int ret = input.read(c); return ret < 0 ? ret : c[0]; } case 1: { // read(char[], int, int) char[] c = new char[2]; int ret = input.read(c, 1, 1); return ret < 0 ? ret : c[1]; } case 2: { // read(CharBuffer) char[] c = new char[1]; CharBuffer cb = CharBuffer.wrap(c); int ret = input.read(cb); return ret < 0 ? ret : c[0]; } default: // read() return input.read(); } } protected boolean isTokenChar(int c) { if (state < 0) { state = 0; } state = runAutomaton.step(state, c); if (state < 0) { return false; } else { return true; } } protected int normalize(int c) { return lowerCase ? Character.toLowerCase(c) : c; } @Override public void reset() throws IOException { try { super.reset(); state = 0; lastOffset = off = 0; bufferedCodePoint = -1; if (streamState == State.RESET) { fail("double reset()"); } } finally { streamState = State.RESET; } } @Override public void close() throws IOException { try { super.close(); // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close() // these tests should disable this check, by default we check the normal workflow. // TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this if (!(streamState == State.END || streamState == State.CLOSE)) { fail("close() called in wrong state: " + streamState); } } finally { streamState = State.CLOSE; } } @Override protected void setReaderTestPoint() { try { if (streamState != State.CLOSE) { fail("setReader() called in wrong state: " + streamState); } } finally { streamState = State.SETREADER; } } @Override public void end() throws IOException { try { super.end(); int finalOffset = correctOffset(off); offsetAtt.setOffset(finalOffset, finalOffset); // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns // false. // these tests should disable this check (in general you should consume the entire stream) if (streamState != State.INCREMENT_FALSE) { fail("end() called in wrong state=" + streamState + "!"); } } finally { streamState = State.END; } } /** * Toggle consumer workflow checking: if your test consumes tokenstreams normally you should leave * this enabled. */ public void setEnableChecks(boolean enableChecks) { this.enableChecks = enableChecks; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy