org.apache.lucene.tests.search.SearchEquivalenceTestBase

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.tests.search;

import java.util.BitSet;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.junit.AfterClass;
import org.junit.BeforeClass;

/**
 * Simple base class for checking search equivalence. Extend it, and write tests that create {@link
 * #randomTerm()}s (all terms are single characters a-z), and use {@link #assertSameSet(Query,
 * Query)} and {@link #assertSubsetOf(Query, Query)}
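 *
 * <p>For example, a subclass test might look like the following (illustrative sketch only, not
 * part of the Lucene sources; the class and method names are made up):
 *
 * <pre>{@code
 * public class TestMyEquivalence extends SearchEquivalenceTestBase {
 *   public void testTermVersusDisjunction() throws Exception {
 *     Term t = randomTerm();
 *     Query q1 = new TermQuery(t);
 *     Query q2 =
 *         new BooleanQuery.Builder()
 *             .add(new TermQuery(t), Occur.SHOULD)
 *             .add(new TermQuery(randomTerm()), Occur.SHOULD)
 *             .build();
 *     // every document matching the single term also matches the disjunction
 *     assertSubsetOf(q1, q2);
 *   }
 * }
 * }</pre>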
 */
@SuppressCodecs("SimpleText")
public abstract class SearchEquivalenceTestBase extends LuceneTestCase {
  protected static IndexSearcher s1, s2;
  protected static Directory directory;
  protected static IndexReader reader;
  protected static Analyzer analyzer;
  protected static String stopword; // we always pick a character as a stopword

  @BeforeClass
  public static void beforeClass() throws Exception {
    Random random = random();
    directory = newDirectory();
    stopword = "" + randomChar();
    CharacterRunAutomaton stopset = new CharacterRunAutomaton(Automata.makeString(stopword));
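    // MockAnalyzer uses this automaton as a stop set: tokens it accepts are removed from the
    // token stream, so the randomly chosen stopword never reaches the index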
    analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
    RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer);
    Document doc = new Document();
    Field id = new StringField("id", "", Field.Store.NO);
    Field field = new TextField("field", "", Field.Store.NO);
    doc.add(id);
    doc.add(field);

    // index some docs
    int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
    for (int i = 0; i < numDocs; i++) {
      id.setStringValue(Integer.toString(i));
      field.setStringValue(randomFieldContents());
      iw.addDocument(doc);
    }

    // delete some docs
    int numDeletes = numDocs / 20;
    for (int i = 0; i < numDeletes; i++) {
      Term toDelete = new Term("id", Integer.toString(random.nextInt(numDocs)));
      if (random.nextBoolean()) {
        iw.deleteDocuments(toDelete);
      } else {
        iw.deleteDocuments(new TermQuery(toDelete));
      }
    }

    reader = iw.getReader();
    s1 = newSearcher(reader);
    // Disable the query cache, which converts two-phase iterators to normal iterators, while we
    // want to make sure two-phase iterators are exercised.
    s1.setQueryCache(null);
    s2 = newSearcher(reader);
    s2.setQueryCache(null);
    iw.close();
  }

  @AfterClass
  public static void afterClass() throws Exception {
    reader.close();
    directory.close();
    analyzer.close();
    reader = null;
    directory = null;
    analyzer = null;
    s1 = s2 = null;
  }

  /**
   * populate a field with random contents. terms should be single characters in lowercase (a-z);
   * tokenization can be assumed to be on whitespace.
   */
  static String randomFieldContents() {
    StringBuilder sb = new StringBuilder();
    int numTerms = random().nextInt(15);
    for (int i = 0; i < numTerms; i++) {
      if (sb.length() > 0) {
        sb.append(' '); // whitespace
      }
      sb.append(randomChar());
    }
    return sb.toString();
  }

  /** returns random character (a-z) */
  static char randomChar() {
    char c = (char) TestUtil.nextInt(random(), 'a', 'z');
    if (random().nextBoolean()) {
      // bias towards earlier chars, so that chars have a ~ zipfian distribution with earlier chars
      // having a higher frequency
      c = (char) TestUtil.nextInt(random(), 'a', c);
    }
    return c;
  }

  /** returns a term suitable for searching. terms are single characters in lowercase (a-z) */
  protected Term randomTerm() {
    return new Term("field", "" + randomChar());
  }

  /** Returns a random filter over the document set */
  protected Query randomFilter() {
    final Query query;
    if (random().nextBoolean()) {
      query = TermRangeQuery.newStringRange("field", "a", "" + randomChar(), true, true);
    } else {
      // use a query with a two-phase approximation
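      // (PhraseQuery's scorer approximates matches as a conjunction of its terms and only
      // verifies positions in a second phase, so it exercises two-phase iteration)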
      PhraseQuery phrase = new PhraseQuery(100, "field", "" + randomChar(), "" + randomChar());
      query = phrase;
    }
    return query;
  }

  /**
   * Asserts that the documents returned by {@code q1} are the same as those returned by {@code
   * q2}
   */
  public void assertSameSet(Query q1, Query q2) throws Exception {
    assertSubsetOf(q1, q2);
    assertSubsetOf(q2, q1);
  }

  /**
   * Asserts that the documents returned by {@code q1} are a subset of those returned by {@code
   * q2}
   */
  public void assertSubsetOf(Query q1, Query q2) throws Exception {
    // test without a filter
    assertSubsetOf(q1, q2, null);

    // test with some filters (this will sometimes exercise advance() enough to test it)
    int numFilters = TEST_NIGHTLY ? atLeast(10) : atLeast(3);
    for (int i = 0; i < numFilters; i++) {
      Query filter = randomFilter();
      // incorporate the filter in different ways.
      assertSubsetOf(q1, q2, filter);
      assertSubsetOf(filteredQuery(q1, filter), filteredQuery(q2, filter), null);
    }
  }

  /**
   * Asserts that the documents returned by {@code q1} are a subset of those returned by {@code
   * q2}.
   *
   * <p>Both queries will be filtered by {@code filter}
   */
  protected void assertSubsetOf(Query q1, Query q2, Query filter) throws Exception {
    QueryUtils.check(q1);
    QueryUtils.check(q2);

    if (filter != null) {
      q1 = new BooleanQuery.Builder().add(q1, Occur.MUST).add(filter, Occur.FILTER).build();
      q2 = new BooleanQuery.Builder().add(q2, Occur.MUST).add(filter, Occur.FILTER).build();
    }

    // we test both INDEXORDER and RELEVANCE because we want to test needsScores=true/false
    for (Sort sort : new Sort[] {Sort.INDEXORDER, Sort.RELEVANCE}) {
      // not efficient, but simple!
      TopDocs td1 = s1.search(q1, reader.maxDoc(), sort);
      TopDocs td2 = s2.search(q2, reader.maxDoc(), sort);
      assertTrue(
          "too many hits: " + td1.totalHits.value() + " > " + td2.totalHits.value(),
          td1.totalHits.value() <= td2.totalHits.value());

      // fill the superset into a bitset
      BitSet bitset = new BitSet();
      for (int i = 0; i < td2.scoreDocs.length; i++) {
        bitset.set(td2.scoreDocs[i].doc);
      }

      // check in the subset, that every bit was set by the super
      for (int i = 0; i < td1.scoreDocs.length; i++) {
        assertTrue(bitset.get(td1.scoreDocs[i].doc));
      }
    }
  }

  /** Assert that two queries return the same documents and with the same scores. */
  protected void assertSameScores(Query q1, Query q2) throws Exception {
    assertSameSet(q1, q2);

    assertSameScores(q1, q2, null);
    // also test with some filters to test advancing
    int numFilters = TEST_NIGHTLY ? atLeast(10) : atLeast(3);
    for (int i = 0; i < numFilters; i++) {
      Query filter = randomFilter();
      // incorporate the filter in different ways.
      assertSameScores(q1, q2, filter);
      assertSameScores(filteredQuery(q1, filter), filteredQuery(q2, filter), null);
    }
  }

  protected void assertSameScores(Query q1, Query q2, Query filter) throws Exception {
    // not efficient, but simple!
    if (filter != null) {
      q1 = new BooleanQuery.Builder().add(q1, Occur.MUST).add(filter, Occur.FILTER).build();
      q2 = new BooleanQuery.Builder().add(q2, Occur.MUST).add(filter, Occur.FILTER).build();
    }
    TopDocs td1 = s1.search(q1, reader.maxDoc());
    TopDocs td2 = s2.search(q2, reader.maxDoc());
    assertEquals(td1.totalHits.value(), td2.totalHits.value());
    for (int i = 0; i < td1.scoreDocs.length; ++i) {
      assertEquals(td1.scoreDocs[i].doc, td2.scoreDocs[i].doc);
      assertEquals(td1.scoreDocs[i].score, td2.scoreDocs[i].score, 10e-5);
    }
  }

  protected Query filteredQuery(Query query, Query filter) {
    return new BooleanQuery.Builder().add(query, Occur.MUST).add(filter, Occur.FILTER).build();
  }
}




