org.apache.lucene.tests.analysis.BaseTokenStreamTestCase Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-test-framework Show documentation
Apache Lucene (module: test-framework)
There is a newer version: 7.6.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.tests.analysis;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.Rethrow;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.fst.Util;

/**
 * Base class for all Lucene unit tests that use TokenStreams.
 *
 * When writing unit tests for analysis components, it's highly recommended to use the helper
 * methods here (especially in conjunction with {@link MockAnalyzer} or {@link MockTokenizer}), as
 * they contain many assertions and checks to catch bugs.
 *
 * @see MockAnalyzer
 * @see MockTokenizer
 */
public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
  // some helpers to test Analyzers and TokenStreams:

  /**
   * Attribute that records if it was cleared or not. This is used for testing that
   * clearAttributes() was called correctly.
   */
  public interface CheckClearAttributesAttribute extends Attribute {
    boolean getAndResetClearCalled();
  }

  /**
   * Attribute that records if it was cleared or not. This is used for testing that
   * clearAttributes() was called correctly.
   */
  public static final class CheckClearAttributesAttributeImpl extends AttributeImpl
      implements CheckClearAttributesAttribute {
    private boolean clearCalled = false;

    @Override
    public boolean getAndResetClearCalled() {
      try {
        return clearCalled;
      } finally {
        clearCalled = false;
      }
    }

    @Override
    public void clear() {
      clearCalled = true;
    }

    @Override
    public boolean equals(Object other) {
      return (other instanceof CheckClearAttributesAttributeImpl
          && ((CheckClearAttributesAttributeImpl) other).clearCalled == this.clearCalled);
    }

    @Override
    public int hashCode() {
      return 76137213 ^ Boolean.valueOf(clearCalled).hashCode();
    }

    @Override
    public void copyTo(AttributeImpl target) {
      target.clear();
    }

    @Override
    public void reflectWith(AttributeReflector reflector) {
      reflector.reflect(CheckClearAttributesAttribute.class, "clearCalled", clearCalled);
    }
  }

  // graphOffsetsAreCorrect validates:
  //   - graph offsets are correct (all tokens leaving from
  //     pos X have the same startOffset; all tokens
  //     arriving to pos Y have the same endOffset)
  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      Integer finalOffset,
      Integer finalPosInc,
      boolean[] keywordAtts,
      boolean graphOffsetsAreCorrect,
      byte[][] payloads,
      int[] flags,
      float[] boost)
      throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt =
        ts.addAttribute(CheckClearAttributesAttribute.class);

    CharTermAttribute termAtt = null;
    if (output.length > 0) {
      assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
      termAtt = ts.getAttribute(CharTermAttribute.class);

      // every UTF-16 character-based TokenStream MUST provide a TermToBytesRefAttribute,
      // implemented by same instance like the CharTermAttribute:
      assertTrue("has no TermToBytesRefAttribute", ts.hasAttribute(TermToBytesRefAttribute.class));
      TermToBytesRefAttribute bytesAtt = ts.getAttribute(TermToBytesRefAttribute.class);

      // ConcatenateGraphFilter has some tricky logic violating this. We have an extra assert there:
      if (!Objects.equals(
          bytesAtt.getClass().getSimpleName(), "BytesRefBuilderTermAttributeImpl")) {
        assertSame(
            "TermToBytesRefAttribute must be implemented by same instance", termAtt, bytesAtt);
      }
    }

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
      assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
      offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }

    TypeAttribute typeAtt = null;
    if (types != null) {
      assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
      typeAtt = ts.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
      assertTrue(
          "has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
      posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }

    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
      assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
      posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }

    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
      assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
      keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }

    PayloadAttribute payloadAtt = null;
    if (payloads != null) {
      assertTrue("has no PayloadAttribute", ts.hasAttribute(PayloadAttribute.class));
      payloadAtt = ts.getAttribute(PayloadAttribute.class);
    }

    FlagsAttribute flagsAtt = null;
    if (flags != null) {
      assertTrue("has no FlagsAttribute", ts.hasAttribute(FlagsAttribute.class));
      flagsAtt = ts.getAttribute(FlagsAttribute.class);
    }

    BoostAttribute boostAtt = null;
    if (boost != null) {
      assertTrue("has no BoostAttribute", ts.hasAttribute(BoostAttribute.class));
      boostAtt = ts.getAttribute(BoostAttribute.class);
    }

    // Maps position to the start/end offset:
    final Map posToStartOffset = new HashMap<>();
    final Map posToEndOffset = new HashMap<>();

    // TODO: would be nice to be able to assert silly duplicated tokens are not created, but a
    // number of cases do this "legitimately": LUCENE-7622

    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
      // extra safety to enforce, that the state is not preserved and also assign bogus values
      ts.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
      if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
      if (typeAtt != null) typeAtt.setType("bogusType");
      if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
      if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
      if (keywordAtt != null) keywordAtt.setKeyword((i & 1) == 0);
      if (payloadAtt != null)
        payloadAtt.setPayload(new BytesRef(new byte[] {0x00, -0x21, 0x12, -0x43, 0x24}));
      if (flagsAtt != null) flagsAtt.setFlags(~0); // all 1's
      if (boostAtt != null) boostAtt.setBoost(-1f);

      checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
      assertTrue("token " + i + " does not exist", ts.incrementToken());
      assertTrue(
          "clearAttributes() was not called correctly in TokenStream chain at token " + i,
          checkClearAtt.getAndResetClearCalled());

      assertEquals("term " + i, output[i], termAtt.toString());
      if (startOffsets != null) {
        assertEquals(
            "startOffset " + i + " term=" + termAtt, startOffsets[i], offsetAtt.startOffset());
      }
      if (endOffsets != null) {
        assertEquals("endOffset " + i + " term=" + termAtt, endOffsets[i], offsetAtt.endOffset());
      }
      if (types != null) {
        assertEquals("type " + i + " term=" + termAtt, types[i], typeAtt.type());
      }
      if (posIncrements != null) {
        assertEquals(
            "posIncrement " + i + " term=" + termAtt,
            posIncrements[i],
            posIncrAtt.getPositionIncrement());
      }
      if (posLengths != null) {
        assertEquals(
            "posLength " + i + " term=" + termAtt, posLengths[i], posLengthAtt.getPositionLength());
      }
      if (keywordAtts != null) {
        assertEquals(
            "keywordAtt " + i + " term=" + termAtt, keywordAtts[i], keywordAtt.isKeyword());
      }
      if (flagsAtt != null) {
        assertEquals("flagsAtt " + i + " term=" + termAtt, flags[i], flagsAtt.getFlags());
      }
      if (boostAtt != null) {
        assertEquals("boostAtt " + i + " term=" + termAtt, boost[i], boostAtt.getBoost(), 0.001);
      }
      if (payloads != null) {
        if (payloads[i] != null) {
          assertEquals("payloads " + i, new BytesRef(payloads[i]), payloadAtt.getPayload());
        } else {
          assertNull("payloads " + i, payloads[i]);
        }
      }
      if (posIncrAtt != null) {
        if (i == 0) {
          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
        } else {
          assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
        }
      }
      if (posLengthAtt != null) {
        assertTrue(
            "posLength must be >= 1; got: " + posLengthAtt.getPositionLength(),
            posLengthAtt.getPositionLength() >= 1);
      }
      // we can enforce some basic things about a few attributes even if the caller doesn't check:
      if (offsetAtt != null) {
        final int startOffset = offsetAtt.startOffset();
        final int endOffset = offsetAtt.endOffset();
        if (finalOffset != null) {
          assertTrue(
              "startOffset (= "
                  + startOffset
                  + ") must be <= finalOffset (= "
                  + finalOffset
                  + ") term="
                  + termAtt,
              startOffset <= finalOffset.intValue());
          assertTrue(
              "endOffset must be <= finalOffset: got endOffset="
                  + endOffset
                  + " vs finalOffset="
                  + finalOffset.intValue()
                  + " term="
                  + termAtt,
              endOffset <= finalOffset.intValue());
        }

        assertTrue(
            "offsets must not go backwards startOffset="
                + startOffset
                + " is < lastStartOffset="
                + lastStartOffset
                + " term="
                + termAtt,
            offsetAtt.startOffset() >= lastStartOffset);
        lastStartOffset = offsetAtt.startOffset();

        if (graphOffsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
          // Validate offset consistency in the graph, ie
          // all tokens leaving from a certain pos have the
          // same startOffset, and all tokens arriving to a
          // certain pos have the same endOffset:
          final int posInc = posIncrAtt.getPositionIncrement();
          pos += posInc;

          final int posLength = posLengthAtt.getPositionLength();

          if (!posToStartOffset.containsKey(pos)) {
            // First time we've seen a token leaving from this position:
            posToStartOffset.put(pos, startOffset);
            // System.out.println("  + s " + pos + " -> " + startOffset);
          } else {
            // We've seen a token leaving from this position
            // before; verify the startOffset is the same:
            // System.out.println("  + vs " + pos + " -> " + startOffset);
            assertEquals(
                i
                    + " inconsistent startOffset: pos="
                    + pos
                    + " posLen="
                    + posLength
                    + " token="
                    + termAtt,
                posToStartOffset.get(pos).intValue(),
                startOffset);
          }

          final int endPos = pos + posLength;

          if (!posToEndOffset.containsKey(endPos)) {
            // First time we've seen a token arriving to this position:
            posToEndOffset.put(endPos, endOffset);
            // System.out.println("  + e " + endPos + " -> " + endOffset);
          } else {
            // We've seen a token arriving to this position
            // before; verify the endOffset is the same:
            // System.out.println("  + ve " + endPos + " -> " + endOffset);
            assertEquals(
                "inconsistent endOffset "
                    + i
                    + " pos="
                    + pos
                    + " posLen="
                    + posLength
                    + " token="
                    + termAtt,
                posToEndOffset.get(endPos).intValue(),
                endOffset);
          }
        }
      }
    }

    if (ts.incrementToken()) {
      fail(
          "TokenStream has more tokens than expected (expected count="
              + output.length
              + "); extra token="
              + ts.getAttribute(CharTermAttribute.class));
    }

    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null) termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null) typeAtt.setType("bogusType");
    if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
    if (keywordAtt != null) keywordAtt.setKeyword(true);
    if (payloadAtt != null)
      payloadAtt.setPayload(new BytesRef(new byte[] {0x00, -0x21, 0x12, -0x43, 0x24}));
    if (flagsAtt != null) flagsAtt.setFlags(~0); // all 1's
    if (boostAtt != null) boostAtt.setBoost(-1);

    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before

    ts.end();
    assertTrue(
        "super.end()/clearAttributes() was not called correctly in end()",
        checkClearAtt.getAndResetClearCalled());

    if (finalOffset != null) {
      assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
      assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
      assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }

    ts.close();
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      Integer finalOffset,
      Integer finalPosInc,
      boolean[] keywordAtts,
      boolean graphOffsetsAreCorrect,
      byte[][] payloads,
      int[] flags)
      throws IOException {
    assertTokenStreamContents(
        ts,
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        posLengths,
        finalOffset,
        finalPosInc,
        keywordAtts,
        graphOffsetsAreCorrect,
        payloads,
        flags,
        null);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      Integer finalOffset,
      boolean[] keywordAtts,
      boolean graphOffsetsAreCorrect)
      throws IOException {
    assertTokenStreamContents(
        ts,
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        posLengths,
        finalOffset,
        keywordAtts,
        graphOffsetsAreCorrect,
        null);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      Integer finalOffset,
      boolean[] keywordAtts,
      boolean graphOffsetsAreCorrect,
      float[] boost)
      throws IOException {
    assertTokenStreamContents(
        ts,
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        posLengths,
        finalOffset,
        null,
        keywordAtts,
        graphOffsetsAreCorrect,
        null,
        null,
        boost);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      Integer finalOffset,
      Integer finalPosInc,
      boolean[] keywordAtts,
      boolean graphOffsetsAreCorrect,
      byte[][] payloads)
      throws IOException {
    assertTokenStreamContents(
        ts,
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        posLengths,
        finalOffset,
        finalPosInc,
        keywordAtts,
        graphOffsetsAreCorrect,
        payloads,
        null,
        null);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      Integer finalOffset,
      boolean graphOffsetsAreCorrect,
      float[] boost)
      throws IOException {
    assertTokenStreamContents(
        ts,
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        posLengths,
        finalOffset,
        null,
        graphOffsetsAreCorrect,
        boost);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      Integer finalOffset,
      boolean graphOffsetsAreCorrect)
      throws IOException {
    assertTokenStreamContents(
        ts,
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        posLengths,
        finalOffset,
        null,
        graphOffsetsAreCorrect,
        null);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      Integer finalOffset)
      throws IOException {
    assertTokenStreamContents(
        ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      Integer finalOffset,
      float[] boost)
      throws IOException {
    assertTokenStreamContents(
        ts,
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        posLengths,
        finalOffset,
        true,
        boost);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      Integer finalOffset)
      throws IOException {
    assertTokenStreamContents(
        ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements)
      throws IOException {
    assertTokenStreamContents(
        ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths)
      throws IOException {
    assertTokenStreamContents(
        ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, null, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types)
      throws IOException {
    assertTokenStreamContents(ts, output, null, null, types, null, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements)
      throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
  }

  public static void assertTokenStreamContents(
      TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
  }

  public static void assertTokenStreamContents(
      TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, Integer finalOffset)
      throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
  }

  public static void assertTokenStreamContents(
      TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, int[] posIncrements)
      throws IOException {
    assertTokenStreamContents(
        ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      int[] posIncrements,
      Integer finalOffset)
      throws IOException {
    assertTokenStreamContents(
        ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
  }

  public static void assertTokenStreamContents(
      TokenStream ts,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      int[] posIncrements,
      int[] posLengths,
      Integer finalOffset)
      throws IOException {
    assertTokenStreamContents(
        ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
  }

  public static void assertAnalyzesTo(
      Analyzer a,
      String input,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements)
      throws IOException {
    assertTokenStreamContents(
        a.tokenStream("dummy", input),
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        null,
        input.length());
    checkResetException(a, input);
    checkAnalysisConsistency(random(), a, true, input);
  }

  public static void assertAnalyzesTo(
      Analyzer a,
      String input,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths)
      throws IOException {
    assertAnalyzesTo(
        a, input, output, startOffsets, endOffsets, types, posIncrements, posLengths, null);
  }

  public static void assertAnalyzesTo(
      Analyzer a,
      String input,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      float[] boost)
      throws IOException {
    assertTokenStreamContents(
        a.tokenStream("dummy", input),
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        posLengths,
        input.length(),
        boost);
    checkResetException(a, input);
    checkAnalysisConsistency(random(), a, true, input);
  }

  public static void assertAnalyzesTo(
      Analyzer a,
      String input,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      boolean graphOffsetsAreCorrect)
      throws IOException {
    assertTokenStreamContents(
        a.tokenStream("dummy", input),
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        posLengths,
        input.length(),
        graphOffsetsAreCorrect);
    checkResetException(a, input);
    checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect);
  }

  public static void assertAnalyzesTo(
      Analyzer a,
      String input,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      String[] types,
      int[] posIncrements,
      int[] posLengths,
      boolean graphOffsetsAreCorrect,
      byte[][] payloads)
      throws IOException {
    assertTokenStreamContents(
        a.tokenStream("dummy", input),
        output,
        startOffsets,
        endOffsets,
        types,
        posIncrements,
        posLengths,
        input.length(),
        null,
        null,
        graphOffsetsAreCorrect,
        payloads);
    checkResetException(a, input);
    checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect);
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output)
      throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, null, null);
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types)
      throws IOException {
    assertAnalyzesTo(a, input, output, null, null, types, null, null);
  }

  public static void assertAnalyzesTo(
      Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
  }

  public static void assertAnalyzesToPositions(
      Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths)
      throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
  }

  public static void assertAnalyzesToPositions(
      Analyzer a,
      String input,
      String[] output,
      String[] types,
      int[] posIncrements,
      int[] posLengths)
      throws IOException {
    assertAnalyzesTo(a, input, output, null, null, types, posIncrements, posLengths);
  }

  public static void assertAnalyzesTo(
      Analyzer a, String input, String[] output, int[] startOffsets, int[] endOffsets)
      throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
  }

  public static void assertAnalyzesTo(
      Analyzer a,
      String input,
      String[] output,
      int[] startOffsets,
      int[] endOffsets,
      int[] posIncrements)
      throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
  }

  public static void checkResetException(Analyzer a, String input) throws IOException {
    TokenStream ts = a.tokenStream("bogus", input);
    try {
      if (ts.incrementToken()) {
        // System.out.println(ts.reflectAsString(false));
        fail("didn't get expected exception when reset() not called");
      }
    } catch (
        @SuppressWarnings("unused")
        IllegalStateException expected) {
      // ok
    } catch (Exception unexpected) {
      unexpected.printStackTrace(System.err);
      fail("got wrong exception when reset() not called: " + unexpected);
    } finally {
      // consume correctly
      ts.reset();
      while (ts.incrementToken()) {}
      ts.end();
      ts.close();
    }

    // check for a missing close()
    ts = a.tokenStream("bogus", input);
    ts.reset();
    while (ts.incrementToken()) {}
    ts.end();
    try {
      ts = a.tokenStream("bogus", input);
      fail("didn't get expected exception when close() not called");
    } catch (
        @SuppressWarnings("unused")
        IllegalStateException expected) {
      // ok
    } finally {
      ts.close();
    }
  }

  // simple utility method for testing stemmers

  public static void checkOneTerm(Analyzer a, final String input, final String expected)
      throws IOException {
    assertAnalyzesTo(a, input, new String[] {expected});
  }

  /**
   * utility method for blasting tokenstreams with data to make sure they don't do anything crazy
   */
  public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
    checkRandomData(random, a, iterations, 20, false, true);
  }

  /**
   * utility method for blasting tokenstreams with data to make sure they don't do anything crazy
   */
  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength)
      throws IOException {
    checkRandomData(random, a, iterations, maxWordLength, false, true);
  }

  /**
   * utility method for blasting tokenstreams with data to make sure they don't do anything crazy
   *
   * @param simple true if only ascii strings will be used (try to avoid)
   */
  public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple)
      throws IOException {
    checkRandomData(random, a, iterations, 20, simple, true);
  }

  /** Asserts that the given stream has expected number of tokens. */
  public static void assertStreamHasNumberOfTokens(TokenStream ts, int expectedCount)
      throws IOException {
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
      count++;
    }
    ts.end();
    assertEquals("wrong number of tokens", expectedCount, count);
  }

  static class AnalysisThread extends Thread {
    final int iterations;
    final int maxWordLength;
    final long seed;
    final Analyzer a;
    final boolean useCharFilter;
    final boolean simple;
    final boolean graphOffsetsAreCorrect;
    final RandomIndexWriter iw;
    final CountDownLatch latch;

    // NOTE: not volatile because we don't want the tests to
    // add memory barriers (ie alter how threads
    // interact)... so this is just "best effort":
    public boolean failed;

    AnalysisThread(
        long seed,
        CountDownLatch latch,
        Analyzer a,
        int iterations,
        int maxWordLength,
        boolean useCharFilter,
        boolean simple,
        boolean graphOffsetsAreCorrect,
        RandomIndexWriter iw) {
      this.seed = seed;
      this.a = a;
      this.iterations = iterations;
      this.maxWordLength = maxWordLength;
      this.useCharFilter = useCharFilter;
      this.simple = simple;
      this.graphOffsetsAreCorrect = graphOffsetsAreCorrect;
      this.iw = iw;
      this.latch = latch;
    }

    @Override
    public void run() {
      boolean success = false;
      try {
        latch.await();
        // see the part in checkRandomData where it replays the same text again
        // to verify reproducability/reuse: hopefully this would catch thread hazards.
        checkRandomData(
            new Random(seed),
            a,
            iterations,
            maxWordLength,
            useCharFilter,
            simple,
            graphOffsetsAreCorrect,
            iw);
        success = true;
      } catch (Exception e) {
        Rethrow.rethrow(e);
      } finally {
        failed = !success;
      }
    }
  }

  public static void checkRandomData(
      Random random, Analyzer a, int iterations, int maxWordLength, boolean simple)
      throws IOException {
    checkRandomData(random, a, iterations, maxWordLength, simple, true);
  }

  public static void checkRandomData(
      Random random,
      Analyzer a,
      int iterations,
      int maxWordLength,
      boolean simple,
      boolean graphOffsetsAreCorrect)
      throws IOException {
    checkResetException(a, "best effort");
    long seed = random.nextLong();
    boolean useCharFilter = random.nextBoolean();
    Directory dir = null;
    RandomIndexWriter iw = null;
    final String postingsFormat = TestUtil.getPostingsFormat("dummy");
    boolean codecOk = iterations * maxWordLength < 100000 && !(postingsFormat.equals("SimpleText"));
    if (rarely(random) && codecOk) {
      dir = newFSDirectory(createTempDir("bttc"));
      iw = new RandomIndexWriter(new Random(seed), dir, a);
    }
    boolean success = false;
    try {
      checkRandomData(
          new Random(seed),
          a,
          iterations,
          maxWordLength,
          useCharFilter,
          simple,
          graphOffsetsAreCorrect,
          iw);
      // now test with multiple threads: note we do the EXACT same thing we did before in each
      // thread,
      // so this should only really fail from another thread if it's an actual thread problem
      int numThreads = TestUtil.nextInt(random, 2, 4);
      final CountDownLatch startingGun = new CountDownLatch(1);
      AnalysisThread[] threads = new AnalysisThread[numThreads];
      for (int i = 0; i < threads.length; i++) {
        threads[i] =
            new AnalysisThread(
                seed,
                startingGun,
                a,
                iterations,
                maxWordLength,
                useCharFilter,
                simple,
                graphOffsetsAreCorrect,
                iw);
      }
      for (int i = 0; i < threads.length; i++) {
        threads[i].start();
      }
      startingGun.countDown();
      for (int i = 0; i < threads.length; i++) {
        try {
          threads[i].join();
        } catch (InterruptedException e) {
          throw new RuntimeException(e);
        }
      }
      for (int i = 0; i < threads.length; i++) {
        if (threads[i].failed) {
          throw new RuntimeException("some thread(s) failed");
        }
      }
      if (iw != null) {
        iw.close();
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(dir);
      } else {
        IOUtils.closeWhileHandlingException(dir); // checkindex
      }
    }
  }

  private static void checkRandomData(
      Random random,
      Analyzer a,
      int iterations,
      int maxWordLength,
      boolean useCharFilter,
      boolean simple,
      boolean graphOffsetsAreCorrect,
      RandomIndexWriter iw)
      throws IOException {

    Document doc = null;
    Field field = null, currentField = null;
    StringReader bogus = new StringReader("");
    if (iw != null) {
      doc = new Document();
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      if (random.nextBoolean()) {
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(random.nextBoolean());
        ft.setStoreTermVectorPositions(random.nextBoolean());
        if (ft.storeTermVectorPositions()) {
          ft.setStoreTermVectorPayloads(random.nextBoolean());
        }
      }
      if (random.nextBoolean()) {
        ft.setOmitNorms(true);
      }
      switch (random.nextInt(4)) {
        case 0:
          ft.setIndexOptions(IndexOptions.DOCS);
          break;
        case 1:
          ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
          break;
        case 2:
          ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
          break;
        default:
          ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
      }
      currentField = field = new Field("dummy", bogus, ft);
      doc.add(currentField);
    }

    for (int i = 0; i < iterations; i++) {
      String text = TestUtil.randomAnalysisString(random, maxWordLength, simple);

      try {
        checkAnalysisConsistency(
            random, a, useCharFilter, text, graphOffsetsAreCorrect, currentField);
        if (iw != null) {
          if (random.nextInt(7) == 0) {
            // pile up a multivalued field
            IndexableFieldType ft = field.fieldType();
            currentField = new Field("dummy", bogus, ft);
            doc.add(currentField);
          } else {
            iw.addDocument(doc);
            if (doc.getFields().size() > 1) {
              // back to 1 field
              currentField = field;
              doc.removeFields("dummy");
              doc.add(currentField);
            }
          }
        }
      } catch (Throwable t) {
        // TODO: really we should pass a random seed to
        // checkAnalysisConsistency then print it here too:
        System.err.println(
            "TEST FAIL: useCharFilter=" + useCharFilter + " text='" + escape(text) + "'");
        Rethrow.rethrow(t);
      }
    }
  }

  public static String escape(String s) {
    int charUpto = 0;
    final StringBuilder sb = new StringBuilder();
    while (charUpto < s.length()) {
      final int c = s.charAt(charUpto);
      if (c == 0xa) {
        // Strangely, you cannot put \ u000A into Java
        // sources (not in a comment nor a string
        // constant)...:
        sb.append("\\n");
      } else if (c == 0xd) {
        // ... nor \ u000D:
        sb.append("\\r");
      } else if (c == '"') {
        sb.append("\\\"");
      } else if (c == '\\') {
        sb.append("\\\\");
      } else if (c >= 0x20 && c < 0x80) {
        sb.append((char) c);
      } else {
        // TODO: we can make ascii easier to read if we
        // don't escape...
        sb.append(String.format(Locale.ROOT, "\\u%04x", c));
      }
      charUpto++;
    }
    return sb.toString();
  }

  public static void checkAnalysisConsistency(
      Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
    checkAnalysisConsistency(random, a, useCharFilter, text, true);
  }

  public static void checkAnalysisConsistency(
      Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect)
      throws IOException {
    checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, null);
  }

  private static void checkAnalysisConsistency(
      Random random,
      Analyzer a,
      boolean useCharFilter,
      String text,
      boolean graphOffsetsAreCorrect,
      Field field)
      throws IOException {

    if (VERBOSE) {
      System.out.println(
          Thread.currentThread().getName()
              + ": NOTE: BaseTokenStreamTestCase: get first token stream now text="
              + text);
    }

    int remainder = random.nextInt(10);
    Reader reader = new StringReader(text);
    TokenStream ts =
        a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
    List tokens = new ArrayList<>();
    List types = new ArrayList<>();
    List positions = new ArrayList<>();
    List positionLengths = new ArrayList<>();
    List startOffsets = new ArrayList<>();
    List endOffsets = new ArrayList<>();
    ts.reset();

    // First pass: save away "correct" tokens
    while (ts.incrementToken()) {
      assertNotNull("has no CharTermAttribute", termAtt);
      tokens.add(termAtt.toString());
      if (typeAtt != null) types.add(typeAtt.type());
      if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
      if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
      if (offsetAtt != null) {
        startOffsets.add(offsetAtt.startOffset());
        endOffsets.add(offsetAtt.endOffset());
      }
    }
    ts.end();
    ts.close();

    // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
    if (!tokens.isEmpty()) {

      // KWTokenizer (for example) can produce a token
      // even when input is length 0:
      if (text.length() != 0) {

        // (Optional) second pass: do something evil:
        final int evilness = random.nextInt(50);
        if (evilness == 17) {
          if (VERBOSE) {
            System.out.println(
                Thread.currentThread().getName()
                    + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
          }
          // Throw an errant exception from the Reader:

          MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
          evilReader.throwExcAfterChar(random.nextInt(text.length() + 1));
          reader = evilReader;

          try {
            // NOTE: some Tokenizers go and read characters
            // when you call .setReader(Reader), eg
            // PatternTokenizer.  This is a bit
            // iffy... (really, they should only
            // pull from the Reader when you call
            // .incremenToken(), I think?), but we
            // currently allow it, so, we must call
            // a.tokenStream inside the try since we may
            // hit the exc on init:
            ts =
                a.tokenStream(
                    "dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
            ts.reset();
            while (ts.incrementToken()) {}
            fail("did not hit exception");
          } catch (RuntimeException re) {
            assertTrue(MockReaderWrapper.isMyEvilException(re));
          }
          try {
            ts.end();
          } catch (IllegalStateException ise) {
            // Catch & ignore MockTokenizer's
            // anger...
            if (ise.getMessage().contains("end() called in wrong state=")) {
              // OK
            } else {
              throw ise;
            }
          }
          ts.close();
        } else if (evilness == 7) {
          // Only consume a subset of the tokens:
          final int numTokensToRead = random.nextInt(tokens.size());
          if (VERBOSE) {
            System.out.println(
                Thread.currentThread().getName()
                    + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming "
                    + numTokensToRead
                    + " of "
                    + tokens.size()
                    + " tokens");
          }

          reader = new StringReader(text);
          ts =
              a.tokenStream(
                  "dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
          ts.reset();
          for (int tokenCount = 0; tokenCount < numTokensToRead; tokenCount++) {
            assertTrue(ts.incrementToken());
          }
          try {
            ts.end();
          } catch (IllegalStateException ise) {
            // Catch & ignore MockTokenizer's
            // anger...
            if (ise.getMessage().contains("end() called in wrong state=")) {
              // OK
            } else {
              throw ise;
            }
          }
          ts.close();
        }
      }
    }

    // Final pass: verify clean tokenization matches
    // results from first pass:

    if (VERBOSE) {
      System.out.println(
          Thread.currentThread().getName()
              + ": NOTE: BaseTokenStreamTestCase: re-run analysis; "
              + tokens.size()
              + " tokens");
    }
    reader = new StringReader(text);

    long seed = random.nextLong();
    random = new Random(seed);
    if (random.nextInt(30) == 7) {
      if (VERBOSE) {
        System.out.println(
            Thread.currentThread().getName()
                + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
      }

      reader = new MockReaderWrapper(random, reader);
    }

    ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
    if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
      // offset + pos + posLength + type
      assertTokenStreamContents(
          ts,
          tokens.toArray(new String[tokens.size()]),
          toIntArray(startOffsets),
          toIntArray(endOffsets),
          types.toArray(new String[types.size()]),
          toIntArray(positions),
          toIntArray(positionLengths),
          text.length(),
          graphOffsetsAreCorrect);
    } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
      // offset + pos + type
      assertTokenStreamContents(
          ts,
          tokens.toArray(new String[tokens.size()]),
          toIntArray(startOffsets),
          toIntArray(endOffsets),
          types.toArray(new String[types.size()]),
          toIntArray(positions),
          null,
          text.length(),
          graphOffsetsAreCorrect);
    } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
      // offset + pos + posLength
      assertTokenStreamContents(
          ts,
          tokens.toArray(new String[tokens.size()]),
          toIntArray(startOffsets),
          toIntArray(endOffsets),
          null,
          toIntArray(positions),
          toIntArray(positionLengths),
          text.length(),
          graphOffsetsAreCorrect);
    } else if (posIncAtt != null && offsetAtt != null) {
      // offset + pos
      assertTokenStreamContents(
          ts,
          tokens.toArray(new String[tokens.size()]),
          toIntArray(startOffsets),
          toIntArray(endOffsets),
          null,
          toIntArray(positions),
          null,
          text.length(),
          graphOffsetsAreCorrect);
    } else if (offsetAtt != null) {
      // offset
      assertTokenStreamContents(
          ts,
          tokens.toArray(new String[tokens.size()]),
          toIntArray(startOffsets),
          toIntArray(endOffsets),
          null,
          null,
          null,
          text.length(),
          graphOffsetsAreCorrect);
    } else {
      // terms only
      assertTokenStreamContents(ts, tokens.toArray(new String[tokens.size()]));
    }

    a.normalize("dummy", text);
    // TODO: what can we do besides testing that the above method does not throw?

    if (field != null) {
      reader = new StringReader(text);
      random = new Random(seed);
      if (random.nextInt(30) == 7) {
        if (VERBOSE) {
          System.out.println(
              Thread.currentThread().getName()
                  + ": NOTE: BaseTokenStreamTestCase: indexing using spoon-feed reader");
        }

        reader = new MockReaderWrapper(random, reader);
      }

      field.setReaderValue(useCharFilter ? new MockCharFilter(reader, remainder) : reader);
    }
  }

  protected String toDot(Analyzer a, String inputText) throws IOException {
    final StringWriter sw = new StringWriter();
    final TokenStream ts = a.tokenStream("field", inputText);
    ts.reset();
    new TokenStreamToDot(inputText, ts, new PrintWriter(sw)).toDot();
    return sw.toString();
  }

  protected void toDotFile(Analyzer a, String inputText, String localFileName) throws IOException {
    Writer w = Files.newBufferedWriter(Paths.get(localFileName), StandardCharsets.UTF_8);
    final TokenStream ts = a.tokenStream("field", inputText);
    ts.reset();
    new TokenStreamToDot(inputText, ts, new PrintWriter(w)).toDot();
    w.close();
  }

  private static int[] toIntArray(List list) {
    return list.stream().mapToInt(Integer::intValue).toArray();
  }

  protected static MockTokenizer whitespaceMockTokenizer(Reader input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    mockTokenizer.setReader(input);
    return mockTokenizer;
  }

  protected static MockTokenizer whitespaceMockTokenizer(String input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    mockTokenizer.setReader(new StringReader(input));
    return mockTokenizer;
  }

  protected static MockTokenizer keywordMockTokenizer(Reader input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
    mockTokenizer.setReader(input);
    return mockTokenizer;
  }

  protected static MockTokenizer keywordMockTokenizer(String input) throws IOException {
    MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
    mockTokenizer.setReader(new StringReader(input));
    return mockTokenizer;
  }

  /** Returns a random AttributeFactory impl */
  public static AttributeFactory newAttributeFactory(Random random) {
    switch (random.nextInt(3)) {
      case 0:
        return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
      case 1:
        return Token.TOKEN_ATTRIBUTE_FACTORY;
      case 2:
        return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
      default:
        throw new AssertionError("Please fix the Random.nextInt() call above");
    }
  }

  /** Returns a random AttributeFactory impl */
  public static AttributeFactory newAttributeFactory() {
    return newAttributeFactory(random());
  }

  private static String toString(Set strings) {
    List stringsList = new ArrayList<>(strings);
    Collections.sort(stringsList);
    StringBuilder b = new StringBuilder();
    for (String s : stringsList) {
      b.append("  ");
      b.append(s);
      b.append('\n');
    }
    return b.toString();
  }

  /**
   * Enumerates all accepted strings in the token graph created by the analyzer on the provided
   * text, and then asserts that it's equal to the expected strings. Uses {@link
   * TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton are
   * all and only the given valid strings.
   *
   * @param analyzer analyzer containing the SynonymFilter under test.
   * @param text text to be analyzed.
   * @param expectedStrings all expected finite strings.
   */
  public static void assertGraphStrings(Analyzer analyzer, String text, String... expectedStrings)
      throws IOException {
    checkAnalysisConsistency(random(), analyzer, true, text, true);
    try (TokenStream tokenStream = analyzer.tokenStream("dummy", text)) {
      assertGraphStrings(tokenStream, expectedStrings);
    }
  }

  /**
   * Enumerates all accepted strings in the token graph created by the already initialized {@link
   * TokenStream}.
   */
  public static void assertGraphStrings(TokenStream tokenStream, String... expectedStrings)
      throws IOException {
    Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
    Set actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);

    Set expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings));

    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
    Set actualStrings = new HashSet<>();
    for (IntsRef ir : actualStringPaths) {
      actualStrings.add(
          Util.toBytesRef(ir, scratchBytesRefBuilder)
              .utf8ToString()
              .replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
    }
    for (String s : actualStrings) {
      assertTrue(
          "Analyzer created unexpected string path: "
              + s
              + "\nexpected:\n"
              + toString(expectedStringsSet)
              + "\nactual:\n"
              + toString(actualStrings),
          expectedStringsSet.contains(s));
    }
    for (String s : expectedStrings) {
      assertTrue(
          "Analyzer created unexpected string path: "
              + s
              + "\nexpected:\n"
              + toString(expectedStringsSet)
              + "\nactual:\n"
              + toString(actualStrings),
          actualStrings.contains(s));
    }
  }

  /**
   * Returns all paths accepted by the token stream graph produced by analyzing text with the
   * provided analyzer. The tokens {@link CharTermAttribute} values are concatenated, and separated
   * with space.
   */
  public static Set getGraphStrings(Analyzer analyzer, String text) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream("dummy", text)) {
      return getGraphStrings(tokenStream);
    }
  }

  /**
   * Returns all paths accepted by the token stream graph produced by the already initialized {@link
   * TokenStream}.
   */
  public static Set getGraphStrings(TokenStream tokenStream) throws IOException {
    Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
    Set actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
    Set paths = new HashSet<>();
    for (IntsRef ir : actualStringPaths) {
      paths.add(
          Util.toBytesRef(ir, scratchBytesRefBuilder)
              .utf8ToString()
              .replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
    }
    return paths;
  }

  /** Returns a {@code String} summary of the tokens this analyzer produces on this text */
  public static String toString(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("field", text)) {
      StringBuilder b = new StringBuilder();
      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
      PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
      assertNotNull(offsetAtt);
      ts.reset();
      int pos = -1;
      while (ts.incrementToken()) {
        pos += posIncAtt.getPositionIncrement();
        b.append(termAtt);
        b.append(" at pos=");
        b.append(pos);
        if (posLengthAtt != null) {
          b.append(" to pos=");
          b.append(pos + posLengthAtt.getPositionLength());
        }
        b.append(" offsets=");
        b.append(offsetAtt.startOffset());
        b.append('-');
        b.append(offsetAtt.endOffset());
        b.append('\n');
      }
      ts.end();
      return b.toString();
    }
  }
}