org.apache.lucene.analysis.BaseTokenStreamTestCase Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-test-framework Show documentation
Apache Lucene (module: test-framework)
There is a newer version: 10.1.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.CountDownLatch;

import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.TestUtil;

/** 
 * Base class for all Lucene unit tests that use TokenStreams. 
 * 
 * When writing unit tests for analysis components, it's highly recommended
 * to use the helper methods here (especially in conjunction with {@link MockAnalyzer} or
 * {@link MockTokenizer}), as they contain many assertions and checks to 
 * catch bugs.
 * 
 * @see MockAnalyzer
 * @see MockTokenizer
 */
public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
  // some helpers to test Analyzers and TokenStreams:
  
  /**
   * Attribute that records if it was cleared or not.  This is used 
   * for testing that clearAttributes() was called correctly.
   */
  public static interface CheckClearAttributesAttribute extends Attribute {
    boolean getAndResetClearCalled();
  }

  /**
   * Attribute that records if it was cleared or not.  This is used 
   * for testing that clearAttributes() was called correctly.
   */
  public static final class CheckClearAttributesAttributeImpl extends AttributeImpl implements CheckClearAttributesAttribute {
    private boolean clearCalled = false;
    
    @Override
    public boolean getAndResetClearCalled() {
      try {
        return clearCalled;
      } finally {
        clearCalled = false;
      }
    }

    @Override
    public void clear() {
      clearCalled = true;
    }

    @Override
    public boolean equals(Object other) {
      return (
        other instanceof CheckClearAttributesAttributeImpl &&
        ((CheckClearAttributesAttributeImpl) other).clearCalled == this.clearCalled
      );
    }

    @Override
    public int hashCode() {
      return 76137213 ^ Boolean.valueOf(clearCalled).hashCode();
    }
    
    @Override
    public void copyTo(AttributeImpl target) {
      ((CheckClearAttributesAttributeImpl) target).clear();
    }

    @Override
    public void reflectWith(AttributeReflector reflector) {
      reflector.reflect(CheckClearAttributesAttribute.class, "clearCalled", clearCalled);
    }
  }

  // offsetsAreCorrect also validates:
  //   - graph offsets are correct (all tokens leaving from
  //     pos X have the same startOffset; all tokens
  //     arriving to pos Y have the same endOffset)
  //   - offsets only move forwards (startOffset >=
  //     lastStartOffset)
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
                                               int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts,
                                               boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
    
    CharTermAttribute termAtt = null;
    if (output.length > 0) {
      assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
      termAtt = ts.getAttribute(CharTermAttribute.class);
    }
    
    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
      assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
      offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }
    
    TypeAttribute typeAtt = null;
    if (types != null) {
      assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
      typeAtt = ts.getAttribute(TypeAttribute.class);
    }
    
    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
      assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
      posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
      assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
      posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
      assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
      keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }
    
    // Maps position to the start/end offset:
    final Map posToStartOffset = new HashMap<>();
    final Map posToEndOffset = new HashMap<>();

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
      // extra safety to enforce, that the state is not preserved and also assign bogus values
      ts.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
      if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
      if (typeAtt != null) typeAtt.setType("bogusType");
      if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
      if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
      if (keywordAtt != null) keywordAtt.setKeyword((i&1) == 0);
      
      checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
      assertTrue("token "+i+" does not exist", ts.incrementToken());
      assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());
      
      assertEquals("term "+i, output[i], termAtt.toString());
      if (startOffsets != null) {
        assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset());
      }
      if (endOffsets != null) {
        assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset());
      }
      if (types != null) {
        assertEquals("type "+i, types[i], typeAtt.type());
      }
      if (posIncrements != null) {
        assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
      }
      if (posLengths != null) {
        assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());
      }
      if (keywordAtts != null) {
        assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
      }
      
      // we can enforce some basic things about a few attributes even if the caller doesn't check:
      if (offsetAtt != null) {
        final int startOffset = offsetAtt.startOffset();
        final int endOffset = offsetAtt.endOffset();
        if (finalOffset != null) {
          assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
          assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue(),
                     endOffset <= finalOffset.intValue());
        }

        if (offsetsAreCorrect) {
          assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
          lastStartOffset = offsetAtt.startOffset();
        }

        if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
          // Validate offset consistency in the graph, ie
          // all tokens leaving from a certain pos have the
          // same startOffset, and all tokens arriving to a
          // certain pos have the same endOffset:
          final int posInc = posIncrAtt.getPositionIncrement();
          pos += posInc;

          final int posLength = posLengthAtt.getPositionLength();

          if (!posToStartOffset.containsKey(pos)) {
            // First time we've seen a token leaving from this position:
            posToStartOffset.put(pos, startOffset);
            //System.out.println("  + s " + pos + " -> " + startOffset);
          } else {
            // We've seen a token leaving from this position
            // before; verify the startOffset is the same:
            //System.out.println("  + vs " + pos + " -> " + startOffset);
            assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset);
          }

          final int endPos = pos + posLength;

          if (!posToEndOffset.containsKey(endPos)) {
            // First time we've seen a token arriving to this position:
            posToEndOffset.put(endPos, endOffset);
            //System.out.println("  + e " + endPos + " -> " + endOffset);
          } else {
            // We've seen a token arriving to this position
            // before; verify the endOffset is the same:
            //System.out.println("  + ve " + endPos + " -> " + endOffset);
            assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset);
          }
        }
      }
      if (posIncrAtt != null) {
        if (i == 0) {
          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
        } else {
          assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
        }
      }
      if (posLengthAtt != null) {
        assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
      }
    }

    if (ts.incrementToken()) {
      fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt);
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null) termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
    if (typeAtt != null) typeAtt.setType("bogusType");
    if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
    
    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before

    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()", checkClearAtt.getAndResetClearCalled());
    
    if (finalOffset != null) {
      assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
      assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
      assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }

    ts.close();
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
                                               int posLengths[], Integer finalOffset, boolean[] keywordAtts,
                                               boolean offsetsAreCorrect) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, null, offsetsAreCorrect);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, null, null, null);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
    assertTokenStreamContents(ts, output, null, null, types, null, null, null);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
  }
  
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
    checkResetException(a, input);
    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
    checkResetException(a, input);
    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
    checkResetException(a, input);
    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, null, null);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, types, null, null);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
  }

  public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
  }
  
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
  }

  static void checkResetException(Analyzer a, String input) throws IOException {
    TokenStream ts = a.tokenStream("bogus", input);
    try {
      if (ts.incrementToken()) {
        //System.out.println(ts.reflectAsString(false));
        fail("didn't get expected exception when reset() not called");
      }
    } catch (IllegalStateException expected) {
      // ok
    } catch (Exception unexpected) {
      unexpected.printStackTrace(System.err);
      fail("got wrong exception when reset() not called: " + unexpected);
    } finally {
      // consume correctly
      ts.reset();
      while (ts.incrementToken()) {}
      ts.end();
      ts.close();
    }
    
    // check for a missing close()
    ts = a.tokenStream("bogus", input);
    ts.reset();
    while (ts.incrementToken()) {}
    ts.end();
    try {
      ts = a.tokenStream("bogus", input);
      fail("didn't get expected exception when close() not called");
    } catch (IllegalStateException expected) {
      // ok
    } finally {
      ts.close();
    }
  }

  // simple utility method for testing stemmers
  
  public static void checkOneTerm(Analyzer a, final String input, final String expected) throws IOException {
    assertAnalyzesTo(a, input, new String[]{expected});
  }
  
  /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
  public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
    checkRandomData(random, a, iterations, 20, false, true);
  }

  /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
    checkRandomData(random, a, iterations, maxWordLength, false, true);
  }
  
  /** 
   * utility method for blasting tokenstreams with data to make sure they don't do anything crazy 
   * @param simple true if only ascii strings will be used (try to avoid)
   */
  public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException {
    checkRandomData(random, a, iterations, 20, simple, true);
  }
  
  /** Asserts that the given stream has expected number of tokens. */
  public static void assertStreamHasNumberOfTokens(TokenStream ts, int expectedCount) throws IOException {
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
      count++;
    }
    ts.end();
    assertEquals("wrong number of tokens", expectedCount, count);
  }
  
  static class AnalysisThread extends Thread {
    final int iterations;
    final int maxWordLength;
    final long seed;
    final Analyzer a;
    final boolean useCharFilter;
    final boolean simple;
    final boolean offsetsAreCorrect;
    final RandomIndexWriter iw;
    final CountDownLatch latch;

    // NOTE: not volatile because we don't want the tests to
    // add memory barriers (ie alter how threads
    // interact)... so this is just "best effort":
    public boolean failed;
    
    AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) {
      this.seed = seed;
      this.a = a;
      this.iterations = iterations;
      this.maxWordLength = maxWordLength;
      this.useCharFilter = useCharFilter;
      this.simple = simple;
      this.offsetsAreCorrect = offsetsAreCorrect;
      this.iw = iw;
      this.latch = latch;
    }
    
    @Override
    public void run() {
      boolean success = false;
      try {
        latch.await();
        // see the part in checkRandomData where it replays the same text again
        // to verify reproducability/reuse: hopefully this would catch thread hazards.
        checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
        success = true;
      } catch (Exception e) {
        Rethrow.rethrow(e);
      } finally {
        failed = !success;
      }
    }
  };
  
  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException {
    checkRandomData(random, a, iterations, maxWordLength, simple, true);
  }

  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
    checkResetException(a, "best effort");
    long seed = random.nextLong();
    boolean useCharFilter = random.nextBoolean();
    Directory dir = null;
    RandomIndexWriter iw = null;
    final String postingsFormat =  TestUtil.getPostingsFormat("dummy");
    boolean codecOk = iterations * maxWordLength < 100000 ||
        !(postingsFormat.equals("Memory") ||
            postingsFormat.equals("SimpleText"));
    if (rarely(random) && codecOk) {
      dir = newFSDirectory(createTempDir("bttc"));
      iw = new RandomIndexWriter(new Random(seed), dir, a);
    }
    boolean success = false;
    try {
      checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
      // now test with multiple threads: note we do the EXACT same thing we did before in each thread,
      // so this should only really fail from another thread if it's an actual thread problem
      int numThreads = TestUtil.nextInt(random, 2, 4);
      final CountDownLatch startingGun = new CountDownLatch(1);
      AnalysisThread threads[] = new AnalysisThread[numThreads];
      for (int i = 0; i < threads.length; i++) {
        threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
      }
      for (int i = 0; i < threads.length; i++) {
        threads[i].start();
      }
      startingGun.countDown();
      for (int i = 0; i < threads.length; i++) {
        try {
          threads[i].join();
        } catch (InterruptedException e) {
          throw new RuntimeException(e);
        }
      }
      for (int i = 0; i < threads.length; i++) {
        if (threads[i].failed) {
          throw new RuntimeException("some thread(s) failed");
        }
      }
      if (iw != null) {
        iw.close();
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(dir);
      } else {
        IOUtils.closeWhileHandlingException(dir); // checkindex
      }
    }
  }

  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {

    final LineFileDocs docs = new LineFileDocs(random);
    Document doc = null;
    Field field = null, currentField = null;
    StringReader bogus = new StringReader("");
    if (iw != null) {
      doc = new Document();
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      if (random.nextBoolean()) {
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(random.nextBoolean());
        ft.setStoreTermVectorPositions(random.nextBoolean());
        if (ft.storeTermVectorPositions()) {
          ft.setStoreTermVectorPayloads(random.nextBoolean());
        }
      }
      if (random.nextBoolean()) {
        ft.setOmitNorms(true);
      }
      switch(random.nextInt(4)) {
        case 0: ft.setIndexOptions(IndexOptions.DOCS); break;
        case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
        case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
        default:
          if (offsetsAreCorrect) {
            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
          } else {
            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
          }
      }
      currentField = field = new Field("dummy", bogus, ft);
      doc.add(currentField);
    }
    
    try {
      for (int i = 0; i < iterations; i++) {
        String text;
        
        if (random.nextInt(10) == 7) {
          // real data from linedocs
          text = docs.nextDoc().get("body");
          if (text.length() > maxWordLength) {
            
            // Take a random slice from the text...:
            int startPos = random.nextInt(text.length() - maxWordLength);
            if (startPos > 0 && Character.isLowSurrogate(text.charAt(startPos))) {
              // Take care not to split up a surrogate pair:
              startPos--;
              assert Character.isHighSurrogate(text.charAt(startPos));
            }
            int endPos = startPos + maxWordLength - 1;
            if (Character.isHighSurrogate(text.charAt(endPos))) {
              // Take care not to split up a surrogate pair:
              endPos--;
            }
            text = text.substring(startPos, 1+endPos);
          }
        } else {
          // synthetic
          text = TestUtil.randomAnalysisString(random, maxWordLength, simple);
        }
        
        try {
          checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField);
          if (iw != null) {
            if (random.nextInt(7) == 0) {
              // pile up a multivalued field
              FieldType ft = field.fieldType();
              currentField = new Field("dummy", bogus, ft);
              doc.add(currentField);
            } else {
              iw.addDocument(doc);
              if (doc.getFields().size() > 1) {
                // back to 1 field
                currentField = field;
                doc.removeFields("dummy");
                doc.add(currentField);
              }
            }
          }
        } catch (Throwable t) {
          // TODO: really we should pass a random seed to
          // checkAnalysisConsistency then print it here too:
          System.err.println("TEST FAIL: useCharFilter=" + useCharFilter + " text='" + escape(text) + "'");
          Rethrow.rethrow(t);
        }
      }
    } finally {
      IOUtils.closeWhileHandlingException(docs);
    }
  }

  public static String escape(String s) {
    int charUpto = 0;
    final StringBuilder sb = new StringBuilder();
    while (charUpto < s.length()) {
      final int c = s.charAt(charUpto);
      if (c == 0xa) {
        // Strangely, you cannot put \ u000A into Java
        // sources (not in a comment nor a string
        // constant)...:
        sb.append("\\n");
      } else if (c == 0xd) {
        // ... nor \ u000D:
        sb.append("\\r");
      } else if (c == '"') {
        sb.append("\\\"");
      } else if (c == '\\') {
        sb.append("\\\\");
      } else if (c >= 0x20 && c < 0x80) {
        sb.append((char) c);
      } else {
        // TODO: we can make ascii easier to read if we
        // don't escape...
        sb.append(String.format(Locale.ROOT, "\\u%04x", c));
      }
      charUpto++;
    }
    return sb.toString();
  }

  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
    checkAnalysisConsistency(random, a, useCharFilter, text, true);
  }

  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
    checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, null);
  }
  
  private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect, Field field) throws IOException {

    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
    }

    int remainder = random.nextInt(10);
    Reader reader = new StringReader(text);
    TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
    List tokens = new ArrayList<>();
    List types = new ArrayList<>();
    List positions = new ArrayList<>();
    List positionLengths = new ArrayList<>();
    List startOffsets = new ArrayList<>();
    List endOffsets = new ArrayList<>();
    ts.reset();

    // First pass: save away "correct" tokens
    while (ts.incrementToken()) {
      assertNotNull("has no CharTermAttribute", termAtt);
      tokens.add(termAtt.toString());
      if (typeAtt != null) types.add(typeAtt.type());
      if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
      if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
      if (offsetAtt != null) {
        startOffsets.add(offsetAtt.startOffset());
        endOffsets.add(offsetAtt.endOffset());
      }
    }
    ts.end();
    ts.close();

    // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
    if (!tokens.isEmpty()) {

      // KWTokenizer (for example) can produce a token
      // even when input is length 0:
      if (text.length() != 0) {

        // (Optional) second pass: do something evil:
        final int evilness = random.nextInt(50);
        if (evilness == 17) {
          if (VERBOSE) {
            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
          }
          // Throw an errant exception from the Reader:

          MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
          evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
          reader = evilReader;

          try {
            // NOTE: some Tokenizers go and read characters
            // when you call .setReader(Reader), eg
            // PatternTokenizer.  This is a bit
            // iffy... (really, they should only
            // pull from the Reader when you call
            // .incremenToken(), I think?), but we
            // currently allow it, so, we must call
            // a.tokenStream inside the try since we may
            // hit the exc on init:
            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
            ts.reset();
            while (ts.incrementToken());
            fail("did not hit exception");
          } catch (RuntimeException re) {
            assertTrue(MockReaderWrapper.isMyEvilException(re));
          }
          try {
            ts.end();
          } catch (IllegalStateException ise) {
            // Catch & ignore MockTokenizer's
            // anger...
            if ("end() called before incrementToken() returned false!".equals(ise.getMessage())) {
              // OK
            } else {
              throw ise;
            }
          }
          ts.close();
        } else if (evilness == 7) {
          // Only consume a subset of the tokens:
          final int numTokensToRead = random.nextInt(tokens.size());
          if (VERBOSE) {
            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
          }

          reader = new StringReader(text);
          ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
          ts.reset();
          for(int tokenCount=0;tokenCount list) {
    int ret[] = new int[list.size()];
    int offset = 0;
    for (Integer i : list) {
      ret[offset++] = i;
    }
    return ret;
  }

  protected static MockTokenizer whitespaceMockTokenizer(Reader input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    mockTokenizer.setReader(input);
    return mockTokenizer;
  }

  protected static MockTokenizer whitespaceMockTokenizer(String input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    mockTokenizer.setReader(new StringReader(input));
    return mockTokenizer;
  }

  protected static MockTokenizer keywordMockTokenizer(Reader input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
    mockTokenizer.setReader(input);
    return mockTokenizer;
  }

  protected static MockTokenizer keywordMockTokenizer(String input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
    mockTokenizer.setReader(new StringReader(input));
    return mockTokenizer;
  }
  
  /** Returns a random AttributeFactory impl */
  public static AttributeFactory newAttributeFactory(Random random) {
    switch (random.nextInt(3)) {
      case 0:
        return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
      case 1:
        return Token.TOKEN_ATTRIBUTE_FACTORY;
      case 2:
        return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
      default:
        throw new AssertionError("Please fix the Random.nextInt() call above");
    }
  }
  
  /** Returns a random AttributeFactory impl */
  public static AttributeFactory newAttributeFactory() {
    return newAttributeFactory(random());
  }
}