All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.tests.util.TestUtil Maven / Gradle / Ivy

There is a newer version: 7.6.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.tests.util;

import static java.nio.charset.StandardCharsets.UTF_8;

import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.CharBuffer;
import java.nio.file.FileSystem;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.BinaryPoint;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.KeywordField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowCodecReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.FilterDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.NoLockFactory;
import org.apache.lucene.tests.codecs.asserting.AssertingCodec;
import org.apache.lucene.tests.codecs.blockterms.LuceneFixedGap;
import org.apache.lucene.tests.mockfile.FilterFileSystem;
import org.apache.lucene.tests.mockfile.VirusCheckingFS;
import org.apache.lucene.tests.mockfile.WindowsFS;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.junit.Assert;

/** General utility methods for Lucene unit tests. */
public final class TestUtil {
  // Static-utility class: private constructor prevents instantiation.
  private TestUtil() {
    //
  }

  /**
   * A comparator that compares UTF-16 strings / char sequences according to Unicode code point
   * order. This can be used to verify {@link BytesRef} order.
   *
   * <p>Warning: This comparator is rather inefficient, because it converts the strings to an
   * {@code int[]} array on each invocation.
   */
  public static final Comparator<CharSequence> STRING_CODEPOINT_COMPARATOR =
      (a, b) -> {
        // Materialize both inputs as code point arrays; this is what makes the comparator
        // inefficient, but it keeps the comparison logic trivially correct.
        final int[] aCodePoints = a.codePoints().toArray();
        final int[] bCodePoints = b.codePoints().toArray();
        for (int i = 0, c = Math.min(aCodePoints.length, bCodePoints.length); i < c; i++) {
          if (aCodePoints[i] < bCodePoints[i]) {
            return -1;
          } else if (aCodePoints[i] > bCodePoints[i]) {
            return 1;
          }
        }
        // Equal common prefix: the shorter sequence sorts first.
        return aCodePoints.length - bCodePoints.length;
      };

  /**
   * Convenience method unzipping zipName into destDir. You must pass it a clean destDir.
   *

Closes the given InputStream after extracting! */ public static void unzip(InputStream in, Path destDir) throws IOException { in = new BufferedInputStream(in); try (ZipInputStream zipInput = new ZipInputStream(in)) { ZipEntry entry; byte[] buffer = new byte[8192]; while ((entry = zipInput.getNextEntry()) != null) { Path targetFile = destDir.resolve(entry.getName()); // be on the safe side: do not rely on that directories are always extracted // before their children (although this makes sense, but is it guaranteed?) Files.createDirectories(targetFile.getParent()); if (!entry.isDirectory()) { OutputStream out = Files.newOutputStream(targetFile); int len; while ((len = zipInput.read(buffer)) >= 0) { out.write(buffer, 0, len); } out.close(); } zipInput.closeEntry(); } } } /** * Checks that the provided iterator is well-formed. * *

    *
  • is read-only: does not allow {@code remove} *
  • returns {@code expectedSize} number of elements *
  • does not return null elements, unless {@code allowNull} is true. *
  • throws NoSuchElementException if {@code next} is called after {@code hasNext} returns * false. *
*/ public static void checkIterator(Iterator iterator, long expectedSize, boolean allowNull) { for (long i = 0; i < expectedSize; i++) { boolean hasNext = iterator.hasNext(); assert hasNext; T v = iterator.next(); assert allowNull || v != null; // for the first element, check that remove is not supported if (i == 0) { try { iterator.remove(); throw new AssertionError("broken iterator (supports remove): " + iterator); } catch ( @SuppressWarnings("unused") UnsupportedOperationException expected) { // ok } } } assert !iterator.hasNext(); try { iterator.next(); throw new AssertionError("broken iterator (allows next() when hasNext==false) " + iterator); } catch ( @SuppressWarnings("unused") NoSuchElementException expected) { // ok } } /** * Checks that the provided iterator is well-formed. * *
    *
  • is read-only: does not allow {@code remove} *
  • does not return null elements. *
  • throws NoSuchElementException if {@code next} is called after {@code hasNext} returns * false. *
*/ public static void checkIterator(Iterator iterator) { while (iterator.hasNext()) { T v = iterator.next(); assert v != null; try { iterator.remove(); throw new AssertionError("broken iterator (supports remove): " + iterator); } catch ( @SuppressWarnings("unused") UnsupportedOperationException expected) { // ok } } try { iterator.next(); throw new AssertionError("broken iterator (allows next() when hasNext==false) " + iterator); } catch ( @SuppressWarnings("unused") NoSuchElementException expected) { // ok } } /** * Checks that the provided collection is read-only. * * @see #checkIterator(Iterator) */ public static void checkReadOnly(Collection coll) { int size = 0; for (@SuppressWarnings("unused") T t : coll) { size += 1; } if (size != coll.size()) { throw new AssertionError( "broken collection, reported size is " + coll.size() + " but iterator has " + size + " elements: " + coll); } if (coll.isEmpty() == false) { try { coll.remove(coll.iterator().next()); throw new AssertionError("broken collection (supports remove): " + coll); } catch ( @SuppressWarnings("unused") UnsupportedOperationException e) { // ok } } try { coll.add(null); throw new AssertionError("broken collection (supports add): " + coll); } catch ( @SuppressWarnings("unused") UnsupportedOperationException e) { // ok } try { coll.addAll(Collections.singleton(null)); throw new AssertionError("broken collection (supports addAll): " + coll); } catch ( @SuppressWarnings("unused") UnsupportedOperationException e) { // ok } checkIterator(coll.iterator()); } public static void syncConcurrentMerges(IndexWriter writer) { syncConcurrentMerges(writer.getConfig().getMergeScheduler()); } public static void syncConcurrentMerges(MergeScheduler ms) { if (ms instanceof ConcurrentMergeScheduler) ((ConcurrentMergeScheduler) ms).sync(); } /** * This runs the CheckIndex tool on the index in. If any issues are hit, a RuntimeException is * thrown; else, true is returned. 
*/ public static CheckIndex.Status checkIndex(Directory dir) throws IOException { return checkIndex(dir, CheckIndex.Level.MIN_LEVEL_FOR_SLOW_CHECKS); } public static CheckIndex.Status checkIndex(Directory dir, int level) throws IOException { return checkIndex(dir, level, false, true, null); } /** * If failFast is true, then throw the first exception when index corruption is hit, instead of * moving on to other fields/segments to look for any other corruption. */ public static CheckIndex.Status checkIndex( Directory dir, int level, boolean failFast, boolean concurrent, ByteArrayOutputStream output) throws IOException { if (output == null) { output = new ByteArrayOutputStream(1024); } // TODO: actually use the dir's locking, unless test uses a special method? // some tests e.g. exception tests become much more complicated if they have to close the writer try (CheckIndex checker = new CheckIndex(dir, NoLockFactory.INSTANCE.obtainLock(dir, "bogus"))) { checker.setLevel(level); checker.setFailFast(failFast); checker.setInfoStream(new PrintStream(output, false, UTF_8), false); if (concurrent) { checker.setThreadCount(RandomizedTest.randomIntBetween(2, 5)); } else { checker.setThreadCount(1); } CheckIndex.Status indexStatus = checker.checkIndex(null); if (indexStatus == null || indexStatus.clean == false) { System.out.println("CheckIndex failed"); System.out.println(output.toString(UTF_8)); throw new RuntimeException("CheckIndex failed"); } else { if (LuceneTestCase.INFOSTREAM) { System.out.println(output.toString(UTF_8)); } return indexStatus; } } } /** * This runs the CheckIndex tool on the Reader. 
If any issues are hit, a RuntimeException is * thrown */ public static void checkReader(IndexReader reader) throws IOException { for (LeafReaderContext context : reader.leaves()) { checkReader(context.reader(), CheckIndex.Level.MIN_LEVEL_FOR_SLOW_CHECKS); } } public static void checkReader(LeafReader reader, int level) throws IOException { ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); PrintStream infoStream = new PrintStream(bos, false, UTF_8); final CodecReader codecReader; if (reader instanceof CodecReader) { codecReader = (CodecReader) reader; reader.checkIntegrity(); } else { codecReader = SlowCodecReaderWrapper.wrap(reader); } CheckIndex.testLiveDocs(codecReader, infoStream, true); CheckIndex.testFieldInfos(codecReader, infoStream, true); CheckIndex.testFieldNorms(codecReader, infoStream, true); CheckIndex.testPostings(codecReader, infoStream, false, level, true); CheckIndex.testStoredFields(codecReader, infoStream, true); CheckIndex.testTermVectors(codecReader, infoStream, false, level, true); CheckIndex.testDocValues(codecReader, infoStream, true); CheckIndex.testPoints(codecReader, infoStream, true); // some checks really against the reader API checkReaderSanity(reader); if (LuceneTestCase.INFOSTREAM) { System.out.println(bos.toString(UTF_8)); } // FieldInfos should be cached at the reader and always return the same instance if (reader.getFieldInfos() != reader.getFieldInfos()) { throw new RuntimeException( "getFieldInfos() returned different instances for class: " + reader.getClass()); } } // used by TestUtil.checkReader to check some things really unrelated to the index, // just looking for bugs in indexreader implementations. 
private static void checkReaderSanity(LeafReader reader) throws IOException { for (FieldInfo info : reader.getFieldInfos()) { // reader shouldn't return normValues if the field does not have them if (!info.hasNorms()) { if (reader.getNormValues(info.name) != null) { throw new RuntimeException("field: " + info.name + " should omit norms but has them!"); } } // reader shouldn't return docValues if the field does not have them // reader shouldn't return multiple docvalues types for the same field. switch (info.getDocValuesType()) { case NONE: if (reader.getBinaryDocValues(info.name) != null || reader.getNumericDocValues(info.name) != null || reader.getSortedDocValues(info.name) != null || reader.getSortedSetDocValues(info.name) != null) { throw new RuntimeException( "field: " + info.name + " has docvalues but should omit them!"); } break; case SORTED: if (reader.getBinaryDocValues(info.name) != null || reader.getNumericDocValues(info.name) != null || reader.getSortedNumericDocValues(info.name) != null || reader.getSortedSetDocValues(info.name) != null) { throw new RuntimeException(info.name + " returns multiple docvalues types!"); } break; case SORTED_NUMERIC: if (reader.getBinaryDocValues(info.name) != null || reader.getNumericDocValues(info.name) != null || reader.getSortedSetDocValues(info.name) != null || reader.getSortedDocValues(info.name) != null) { throw new RuntimeException(info.name + " returns multiple docvalues types!"); } break; case SORTED_SET: if (reader.getBinaryDocValues(info.name) != null || reader.getNumericDocValues(info.name) != null || reader.getSortedNumericDocValues(info.name) != null || reader.getSortedDocValues(info.name) != null) { throw new RuntimeException(info.name + " returns multiple docvalues types!"); } break; case BINARY: if (reader.getNumericDocValues(info.name) != null || reader.getSortedDocValues(info.name) != null || reader.getSortedNumericDocValues(info.name) != null || reader.getSortedSetDocValues(info.name) != null) { throw 
new RuntimeException(info.name + " returns multiple docvalues types!"); } break; case NUMERIC: if (reader.getBinaryDocValues(info.name) != null || reader.getSortedDocValues(info.name) != null || reader.getSortedNumericDocValues(info.name) != null || reader.getSortedSetDocValues(info.name) != null) { throw new RuntimeException(info.name + " returns multiple docvalues types!"); } break; default: throw new AssertionError(); } } } /** * Returns true if the arguments are equal or within the range of allowed error (inclusive). * Returns {@code false} if either of the arguments is NaN. * *

Two float numbers are considered equal if there are {@code (maxUlps - 1)} (or fewer) * floating point numbers between them, i.e. two adjacent floating point numbers are considered * equal. * *

Adapted from org.apache.commons.numbers.core.Precision * *

github: https://github.com/apache/commons-numbers release 1.2 * * @param x first value * @param y second value * @param maxUlps {@code (maxUlps - 1)} is the number of floating point values between {@code x} * and {@code y}. * @return {@code true} if there are fewer than {@code maxUlps} floating point values between * {@code x} and {@code y}. */ public static boolean floatUlpEquals(final float x, final float y, final short maxUlps) { final int xInt = Float.floatToRawIntBits(x); final int yInt = Float.floatToRawIntBits(y); if ((xInt ^ yInt) < 0) { // Numbers have opposite signs, take care of overflow. // Remove the sign bit to obtain the absolute ULP above zero. final int deltaPlus = xInt & Integer.MAX_VALUE; final int deltaMinus = yInt & Integer.MAX_VALUE; // Note: // If either value is NaN, the exponent bits are set to (255 << 23) and the // distance above 0.0 is always above a short ULP error. So omit the test // for NaN and return directly. // Avoid possible overflow from adding the deltas by splitting the comparison return deltaPlus <= maxUlps && deltaMinus <= (maxUlps - deltaPlus); } // Numbers have same sign, there is no risk of overflow. return Math.abs(xInt - yInt) <= maxUlps && !Float.isNaN(x) && !Float.isNaN(y); } /** * Returns true if the arguments are equal or within the range of allowed error (inclusive). * Returns {@code false} if either of the arguments is NaN. * *

Two double numbers are considered equal if there are {@code (maxUlps - 1)} (or fewer) * floating point numbers between them, i.e. two adjacent floating point numbers are considered * equal. * *

Adapted from org.apache.commons.numbers.core.Precision * *

github: https://github.com/apache/commons-numbers release 1.2 * * @param x first value * @param y second value * @param maxUlps {@code (maxUlps - 1)} is the number of floating point values between {@code x} * and {@code y}. * @return {@code true} if there are fewer than {@code maxUlps} floating point values between * {@code x} and {@code y}. */ public static boolean doubleUlpEquals(final double x, final double y, final int maxUlps) { final long xInt = Double.doubleToRawLongBits(x); final long yInt = Double.doubleToRawLongBits(y); if ((xInt ^ yInt) < 0) { // Numbers have opposite signs, take care of overflow. // Remove the sign bit to obtain the absolute ULP above zero. final long deltaPlus = xInt & Long.MAX_VALUE; final long deltaMinus = yInt & Long.MAX_VALUE; // Note: // If either value is NaN, the exponent bits are set to (2047L << 52) and the // distance above 0.0 is always above an integer ULP error. So omit the test // for NaN and return directly. // Avoid possible overflow from adding the deltas by splitting the comparison return deltaPlus <= maxUlps && deltaMinus <= (maxUlps - deltaPlus); } // Numbers have same sign, there is no risk of overflow. 
return Math.abs(xInt - yInt) <= maxUlps && !Double.isNaN(x) && !Double.isNaN(y); } /** start and end are BOTH inclusive */ public static int nextInt(Random r, int start, int end) { return RandomNumbers.randomIntBetween(r, start, end); } /** start and end are BOTH inclusive */ public static long nextLong(Random r, long start, long end) { assert end >= start : "start=" + start + ",end=" + end; final BigInteger range = BigInteger.valueOf(end).add(BigInteger.valueOf(1)).subtract(BigInteger.valueOf(start)); if (range.compareTo(BigInteger.valueOf(Integer.MAX_VALUE)) <= 0) { return start + r.nextInt(range.intValue()); } else { // probably not evenly distributed when range is large, but OK for tests final BigInteger augend = new BigDecimal(range).multiply(new BigDecimal(r.nextDouble())).toBigInteger(); final long result = BigInteger.valueOf(start).add(augend).longValue(); assert result >= start; assert result <= end; return result; } } /** Returns a randomish big integer with {@code 1 .. maxBytes} storage. 
*/ public static BigInteger nextBigInteger(Random random, int maxBytes) { int length = TestUtil.nextInt(random, 1, maxBytes); byte[] buffer = new byte[length]; random.nextBytes(buffer); return new BigInteger(buffer); } public static String randomSimpleString(Random r, int maxLength) { return randomSimpleString(r, 0, maxLength); } public static String randomSimpleString(Random r, int minLength, int maxLength) { final int end = nextInt(r, minLength, maxLength); if (end == 0) { // allow 0 length return ""; } final char[] buffer = new char[end]; for (int i = 0; i < end; i++) { buffer[i] = (char) TestUtil.nextInt(r, 'a', 'z'); } return new String(buffer, 0, end); } public static String randomSimpleStringRange( Random r, char minChar, char maxChar, int maxLength) { final int end = nextInt(r, 0, maxLength); if (end == 0) { // allow 0 length return ""; } final char[] buffer = new char[end]; for (int i = 0; i < end; i++) { buffer[i] = (char) TestUtil.nextInt(r, minChar, maxChar); } return new String(buffer, 0, end); } public static String randomSimpleString(Random r) { return randomSimpleString(r, 0, 10); } /** Returns random string, including full unicode range. */ public static String randomUnicodeString(Random r) { return randomUnicodeString(r, 20); } /** Returns a random string up to a certain length. */ public static String randomUnicodeString(Random r, int maxLength) { final int end = nextInt(r, 0, maxLength); if (end == 0) { // allow 0 length return ""; } final char[] buffer = new char[end]; randomFixedLengthUnicodeString(r, buffer, 0, buffer.length); return new String(buffer, 0, end); } /** Fills provided char[] with valid random unicode code unit sequence. 
*/ public static void randomFixedLengthUnicodeString( Random random, char[] chars, int offset, int length) { int i = offset; final int end = offset + length; while (i < end) { final int t = random.nextInt(5); if (0 == t && i < length - 1) { // Make a surrogate pair // High surrogate chars[i++] = (char) nextInt(random, 0xd800, 0xdbff); // Low surrogate chars[i++] = (char) nextInt(random, 0xdc00, 0xdfff); } else if (t <= 1) { chars[i++] = (char) random.nextInt(0x80); } else if (2 == t) { chars[i++] = (char) nextInt(random, 0x80, 0x7ff); } else if (3 == t) { chars[i++] = (char) nextInt(random, 0x800, 0xd7ff); } else if (4 == t) { chars[i++] = (char) nextInt(random, 0xe000, 0xffff); } } } /** * Returns a String that's "regexpish" (contains lots of operators typically found in regular * expressions) If you call this enough times, you might get a valid regex! */ public static String randomRegexpishString(Random r) { return randomRegexpishString(r, 20); } /** * Maximum recursion bound for '+' and '*' replacements in {@link #randomRegexpishString(Random, * int)}. */ private static final int maxRecursionBound = 5; /** Operators for {@link #randomRegexpishString(Random, int)}. */ private static final List ops = Arrays.asList( ".", "?", "{0," + maxRecursionBound + "}", // bounded replacement for '*' "{1," + maxRecursionBound + "}", // bounded replacement for '+' "(", ")", "-", "[", "]", "|"); /** * Returns a String that's "regexpish" (contains lots of operators typically found in regular * expressions) If you call this enough times, you might get a valid regex! * *

Note: to avoid practically endless backtracking patterns we replace asterisk and plus * operators with bounded repetitions. See LUCENE-4111 for more info. * * @param maxLength A hint about maximum length of the regexpish string. It may be exceeded by a * few characters. */ public static String randomRegexpishString(Random r, int maxLength) { final StringBuilder regexp = new StringBuilder(maxLength); for (int i = nextInt(r, 0, maxLength); i > 0; i--) { if (r.nextBoolean()) { regexp.append((char) RandomNumbers.randomIntBetween(r, 'a', 'z')); } else { regexp.append(RandomPicks.randomFrom(r, ops)); } } return regexp.toString(); } private static final String[] HTML_CHAR_ENTITIES = { "AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde", "Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH", "Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT", "Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT", "Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega", "Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi", "QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta", "Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml", "Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym", "alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde", "auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil", "cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup", "curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide", "eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon", "equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall", "frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr", "harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave", "image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa", "lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le", "lfloor", "lowast",
// NOTE(review): the region below was damaged in extraction from its original rendering:
// several string literals inside randomHtmlishString appear to have lost their contents
// (the quoted text between sb.append(" and ") is gone, leaving broken multi-line literals).
// Do not trust the literals or the line structure here; restore this method from the
// upstream Lucene sources rather than editing it in place.
"loz", "lrm", "lsaquo", "lsquo", "lt", "macr", "mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash", "ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc", "oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf", "ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil", "perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop", "psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil", "rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo", "sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim", "spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe", "szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn", "tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave", "uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen", "yuml", "zeta", "zwj", "zwnj" }; public static String randomHtmlishString(Random random, int numElements) { final int end = nextInt(random, 0, numElements); if (end == 0) { // allow 0 length return ""; } StringBuilder sb = new StringBuilder(); for (int i = 0; i < end; i++) { int val = random.nextInt(25); switch (val) { case 0: sb.append("

"); break; case 1: { sb.append("<"); sb.append(" ".substring(nextInt(random, 0, 4))); sb.append(randomSimpleString(random)); for (int j = 0; j < nextInt(random, 0, 10); ++j) { sb.append(' '); sb.append(randomSimpleString(random)); sb.append(" ".substring(nextInt(random, 0, 1))); sb.append('='); sb.append(" ".substring(nextInt(random, 0, 1))); sb.append("\"".substring(nextInt(random, 0, 1))); sb.append(randomSimpleString(random)); sb.append("\"".substring(nextInt(random, 0, 1))); } sb.append(" ".substring(nextInt(random, 0, 4))); sb.append("/".substring(nextInt(random, 0, 1))); sb.append(">".substring(nextInt(random, 0, 1))); break; } case 2: { sb.append("".substring(nextInt(random, 0, 1))); break; } case 3: sb.append(">"); break; case 4: sb.append("

"); break; case 5: sb.append(""); break; case 16: { sb.append("&"); switch (nextInt(random, 0, 2)) { case 0: sb.append(randomSimpleString(random)); break; case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break; } sb.append(";".substring(nextInt(random, 0, 1))); break; } case 17: { sb.append("&#"); if (0 == nextInt(random, 0, 1)) { sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); sb.append(";".substring(nextInt(random, 0, 1))); } break; } case 18: { sb.append("&#x"); if (0 == nextInt(random, 0, 1)) { sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16)); sb.append(";".substring(nextInt(random, 0, 1))); } break; } case 19: sb.append(";"); break; case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break; case 21: sb.append("\n"); break; case 22: sb.append(" ".substring(nextInt(random, 0, 10))); break; case 23: { sb.append("<"); if (0 == nextInt(random, 0, 3)) { sb.append(" ".substring(nextInt(random, 1, 10))); } if (0 == nextInt(random, 0, 1)) { sb.append("/"); if (0 == nextInt(random, 0, 3)) { sb.append(" ".substring(nextInt(random, 1, 10))); } } switch (nextInt(random, 0, 3)) { case 0: sb.append(randomlyRecaseCodePoints(random, "script")); break; case 1: sb.append(randomlyRecaseCodePoints(random, "style")); break; case 2: sb.append(randomlyRecaseCodePoints(random, "br")); break; // default: append nothing } sb.append(">".substring(nextInt(random, 0, 1))); break; } default: sb.append(randomSimpleString(random)); } } return sb.toString(); } /** Randomly upcases, downcases, or leaves intact each code point in the given string */ public static String randomlyRecaseCodePoints(Random random, String str) { StringBuilder builder = new StringBuilder(); int pos = 0; while (pos < str.length()) { int codePoint = str.codePointAt(pos); pos += Character.charCount(codePoint); switch (nextInt(random, 0, 2)) { case 0: builder.appendCodePoint(Character.toUpperCase(codePoint)); break; case 1:
// randomlyRecaseCodePoints: case 0 upper-cases, case 1 (below) lower-cases, case 2 keeps
// the code point unchanged.
builder.appendCodePoint(Character.toLowerCase(codePoint)); break; case 2: builder.appendCodePoint(codePoint); // leave intact } } return builder.toString(); } private static final int[] blockStarts = { 0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400, 0x0500, 0x0530, 0x0590, 0x0600, 0x0700, 0x0750, 0x0780, 0x07C0, 0x0800, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80, 0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x1000, 0x10A0, 0x1100, 0x1200, 0x1380, 0x13A0, 0x1400, 0x1680, 0x16A0, 0x1700, 0x1720, 0x1740, 0x1760, 0x1780, 0x1800, 0x18B0, 0x1900, 0x1950, 0x1980, 0x19E0, 0x1A00, 0x1A20, 0x1B00, 0x1B80, 0x1C00, 0x1C50, 0x1CD0, 0x1D00, 0x1D80, 0x1DC0, 0x1E00, 0x1F00, 0x2000, 0x2070, 0x20A0, 0x20D0, 0x2100, 0x2150, 0x2190, 0x2200, 0x2300, 0x2400, 0x2440, 0x2460, 0x2500, 0x2580, 0x25A0, 0x2600, 0x2700, 0x27C0, 0x27F0, 0x2800, 0x2900, 0x2980, 0x2A00, 0x2B00, 0x2C00, 0x2C60, 0x2C80, 0x2D00, 0x2D30, 0x2D80, 0x2DE0, 0x2E00, 0x2E80, 0x2F00, 0x2FF0, 0x3000, 0x3040, 0x30A0, 0x3100, 0x3130, 0x3190, 0x31A0, 0x31C0, 0x31F0, 0x3200, 0x3300, 0x3400, 0x4DC0, 0x4E00, 0xA000, 0xA490, 0xA4D0, 0xA500, 0xA640, 0xA6A0, 0xA700, 0xA720, 0xA800, 0xA830, 0xA840, 0xA880, 0xA8E0, 0xA900, 0xA930, 0xA960, 0xA980, 0xAA00, 0xAA60, 0xAA80, 0xABC0, 0xAC00, 0xD7B0, 0xE000, 0xF900, 0xFB00, 0xFB50, 0xFE00, 0xFE10, 0xFE20, 0xFE30, 0xFE50, 0xFE70, 0xFF00, 0xFFF0, 0x10000, 0x10080, 0x10100, 0x10140, 0x10190, 0x101D0, 0x10280, 0x102A0, 0x10300, 0x10330, 0x10380, 0x103A0, 0x10400, 0x10450, 0x10480, 0x10800, 0x10840, 0x10900, 0x10920, 0x10A00, 0x10A60, 0x10B00, 0x10B40, 0x10B60, 0x10C00, 0x10E60, 0x11080, 0x12000, 0x12400, 0x13000, 0x1D000, 0x1D100, 0x1D200, 0x1D300, 0x1D360, 0x1D400, 0x1F000, 0x1F030, 0x1F100, 0x1F200, 0x20000, 0x2A700, 0x2F800, 0xE0000, 0xE0100, 0xF0000, 0x100000 }; private static final int[] blockEnds = { 0x007F, 0x00FF, 0x017F, 0x024F, 0x02AF, 0x02FF, 0x036F, 0x03FF, 0x04FF, 0x052F, 0x058F, 0x05FF, 0x06FF, 0x074F, 0x077F, 0x07BF, 0x07FF, 0x083F, 0x097F,
// blockEnds[i] is the inclusive end of the Unicode block starting at blockStarts[i]
// (the two arrays are indexed in parallel by randomRealisticUnicodeString below).
0x09FF, 0x0A7F, 0x0AFF, 0x0B7F, 0x0BFF, 0x0C7F, 0x0CFF, 0x0D7F, 0x0DFF, 0x0E7F, 0x0EFF, 0x0FFF, 0x109F, 0x10FF, 0x11FF, 0x137F, 0x139F, 0x13FF, 0x167F, 0x169F, 0x16FF, 0x171F, 0x173F, 0x175F, 0x177F, 0x17FF, 0x18AF, 0x18FF, 0x194F, 0x197F, 0x19DF, 0x19FF, 0x1A1F, 0x1AAF, 0x1B7F, 0x1BBF, 0x1C4F, 0x1C7F, 0x1CFF, 0x1D7F, 0x1DBF, 0x1DFF, 0x1EFF, 0x1FFF, 0x206F, 0x209F, 0x20CF, 0x20FF, 0x214F, 0x218F, 0x21FF, 0x22FF, 0x23FF, 0x243F, 0x245F, 0x24FF, 0x257F, 0x259F, 0x25FF, 0x26FF, 0x27BF, 0x27EF, 0x27FF, 0x28FF, 0x297F, 0x29FF, 0x2AFF, 0x2BFF, 0x2C5F, 0x2C7F, 0x2CFF, 0x2D2F, 0x2D7F, 0x2DDF, 0x2DFF, 0x2E7F, 0x2EFF, 0x2FDF, 0x2FFF, 0x303F, 0x309F, 0x30FF, 0x312F, 0x318F, 0x319F, 0x31BF, 0x31EF, 0x31FF, 0x32FF, 0x33FF, 0x4DBF, 0x4DFF, 0x9FFF, 0xA48F, 0xA4CF, 0xA4FF, 0xA63F, 0xA69F, 0xA6FF, 0xA71F, 0xA7FF, 0xA82F, 0xA83F, 0xA87F, 0xA8DF, 0xA8FF, 0xA92F, 0xA95F, 0xA97F, 0xA9DF, 0xAA5F, 0xAA7F, 0xAADF, 0xABFF, 0xD7AF, 0xD7FF, 0xF8FF, 0xFAFF, 0xFB4F, 0xFDFF, 0xFE0F, 0xFE1F, 0xFE2F, 0xFE4F, 0xFE6F, 0xFEFF, 0xFFEF, 0xFFFF, 0x1007F, 0x100FF, 0x1013F, 0x1018F, 0x101CF, 0x101FF, 0x1029F, 0x102DF, 0x1032F, 0x1034F, 0x1039F, 0x103DF, 0x1044F, 0x1047F, 0x104AF, 0x1083F, 0x1085F, 0x1091F, 0x1093F, 0x10A5F, 0x10A7F, 0x10B3F, 0x10B5F, 0x10B7F, 0x10C4F, 0x10E7F, 0x110CF, 0x123FF, 0x1247F, 0x1342F, 0x1D0FF, 0x1D1FF, 0x1D24F, 0x1D35F, 0x1D37F, 0x1D7FF, 0x1F02F, 0x1F09F, 0x1F1FF, 0x1F2FF, 0x2A6DF, 0x2B73F, 0x2FA1F, 0xE007F, 0xE01EF, 0xFFFFF, 0x10FFFF }; /** * Returns random string of length between 0-20 codepoints, all codepoints within the same unicode * block. */ public static String randomRealisticUnicodeString(Random r) { return randomRealisticUnicodeString(r, 20); } /** * Returns random string of length up to maxLength codepoints , all codepoints within the same * unicode block. 
*/
  public static String randomRealisticUnicodeString(Random r, int maxLength) {
    return randomRealisticUnicodeString(r, 0, maxLength);
  }

  /**
   * Returns random string of length between min and max codepoints, all codepoints within the same
   * unicode block.
   */
  public static String randomRealisticUnicodeString(Random r, int minLength, int maxLength) {
    final int end = nextInt(r, minLength, maxLength);
    // pick a single block; all code points of the result are drawn from it
    final int block = r.nextInt(blockStarts.length);
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < end; i++)
      sb.appendCodePoint(nextInt(r, blockStarts[block], blockEnds[block]));
    return sb.toString();
  }

  /** Returns random string, with a given UTF-8 byte length */
  public static String randomFixedByteLengthUnicodeString(Random r, int length) {
    // worst case is 3 UTF-16 chars per budgeted UTF-8 byte, so this never overflows
    final char[] buffer = new char[length * 3];
    int bytes = length; // remaining UTF-8 byte budget
    int i = 0;
    for (; i < buffer.length && bytes != 0; i++) {
      // t selects the encoded size class: 0 -> 1 byte, 1 -> 2 bytes,
      // 2 and 3 -> 3 bytes (below/above the surrogate range), 4 -> 4 bytes (surrogate pair).
      // The allowed classes shrink as the remaining budget shrinks.
      int t;
      if (bytes >= 4) {
        t = r.nextInt(5);
      } else if (bytes == 3) {
        t = r.nextInt(4);
      } else if (bytes == 2) {
        t = r.nextInt(2);
      } else {
        t = 0;
      }
      if (t == 0) {
        buffer[i] = (char) r.nextInt(0x80);
        bytes--;
      } else if (1 == t) {
        buffer[i] = (char) nextInt(r, 0x80, 0x7ff);
        bytes -= 2;
      } else if (2 == t) {
        buffer[i] = (char) nextInt(r, 0x800, 0xd7ff);
        bytes -= 3;
      } else if (3 == t) {
        buffer[i] = (char) nextInt(r, 0xe000, 0xffff);
        bytes -= 3;
      } else if (4 == t) {
        // Make a surrogate pair
        // High surrogate
        buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff);
        // Low surrogate
        buffer[i] = (char) nextInt(r, 0xdc00, 0xdfff);
        bytes -= 4;
      }
    }
    return new String(buffer, 0, i);
  }

  /** Returns a random binary term.
*/
  public static BytesRef randomBinaryTerm(Random r) {
    return randomBinaryTerm(r, r.nextInt(15));
  }

  /** Returns a random binary with a given length */
  public static BytesRef randomBinaryTerm(Random r, int length) {
    BytesRef b = new BytesRef(length);
    r.nextBytes(b.bytes);
    b.length = length;
    return b;
  }

  /**
   * Return a Codec that can read any of the default codecs and formats, but always writes in the
   * specified format.
   */
  public static Codec alwaysPostingsFormat(final PostingsFormat format) {
    // TODO: we really need for postings impls etc to announce themselves
    // (and maybe their params, too) to infostream on flush and merge.
    // otherwise in a real debugging situation we won't know whats going on!
    if (LuceneTestCase.VERBOSE) {
      System.out.println("forcing postings format to:" + format);
    }
    return new AssertingCodec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return format;
      }
    };
  }

  /**
   * Return a Codec that can read any of the default codecs and formats, but always writes in the
   * specified format.
   */
  public static Codec alwaysDocValuesFormat(final DocValuesFormat format) {
    // TODO: we really need for docvalues impls etc to announce themselves
    // (and maybe their params, too) to infostream on flush and merge.
    // otherwise in a real debugging situation we won't know whats going on!
    if (LuceneTestCase.VERBOSE) {
      System.out.println("TestUtil: forcing docvalues format to:" + format);
    }
    return new AssertingCodec() {
      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        return format;
      }
    };
  }

  /**
   * Return a Codec that can read any of the default codecs and formats, but always writes in the
   * specified format.
   */
  public static Codec alwaysKnnVectorsFormat(final KnnVectorsFormat format) {
    // TODO: we really need for knn vectors impls etc to announce themselves
    // (and maybe their params, too) to infostream on flush and merge.
    // otherwise in a real debugging situation we won't know whats going on!
if (LuceneTestCase.VERBOSE) {
      System.out.println("TestUtil: forcing knn vectors format to:" + format);
    }
    return new AssertingCodec() {
      @Override
      public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
        return format;
      }
    };
  }

  /**
   * Returns the actual default codec (e.g. LuceneMNCodec) for this version of Lucene. This may be
   * different from {@link Codec#getDefault()} because that is randomized.
   */
  public static Codec getDefaultCodec() {
    return new Lucene101Codec();
  }

  /**
   * Returns the actual default postings format (e.g. LuceneMNPostingsFormat) for this version of
   * Lucene.
   */
  public static PostingsFormat getDefaultPostingsFormat() {
    return new Lucene101PostingsFormat();
  }

  /**
   * Returns the actual default postings format (e.g. LuceneMNPostingsFormat) for this version of
   * Lucene, configured with the given block-tree min/max block sizes.
   *
   * @lucene.internal this may disappear at any time
   */
  public static PostingsFormat getDefaultPostingsFormat(
      int minItemsPerBlock, int maxItemsPerBlock) {
    return new Lucene101PostingsFormat(minItemsPerBlock, maxItemsPerBlock);
  }

  /** Returns a random postings format that supports term ordinals */
  public static PostingsFormat getPostingsFormatWithOrds(Random r) {
    switch (r.nextInt(2)) {
      case 0:
        return new LuceneFixedGap();
      case 1:
        return new BlockTreeOrdsPostingsFormat();
        // TODO: these don't actually support ords!
        // case 2: return new FSTOrdPostingsFormat();
      default:
        throw new AssertionError();
    }
  }

  /**
   * Returns the actual default docvalues format (e.g. LuceneMNDocValuesFormat) for this version of
   * Lucene.
*/ public static DocValuesFormat getDefaultDocValuesFormat() { return new Lucene90DocValuesFormat(); } // TODO: generalize all 'test-checks-for-crazy-codecs' to // annotations (LUCENE-3489) public static String getPostingsFormat(String field) { return getPostingsFormat(Codec.getDefault(), field); } public static String getPostingsFormat(Codec codec, String field) { PostingsFormat p = codec.postingsFormat(); if (p instanceof PerFieldPostingsFormat) { return ((PerFieldPostingsFormat) p).getPostingsFormatForField(field).getName(); } else { return p.getName(); } } public static String getDocValuesFormat(String field) { return getDocValuesFormat(Codec.getDefault(), field); } public static String getDocValuesFormat(Codec codec, String field) { DocValuesFormat f = codec.docValuesFormat(); if (f instanceof PerFieldDocValuesFormat) { return ((PerFieldDocValuesFormat) f).getDocValuesFormatForField(field).getName(); } else { return f.getName(); } } // TODO: remove this, push this test to Lucene40/Lucene42 codec tests public static boolean fieldSupportsHugeBinaryDocValues(String field) { String dvFormat = getDocValuesFormat(field); if (dvFormat.equals("Lucene40") || dvFormat.equals("Lucene42")) { return false; } return true; } /** * Returns the actual default vector format (e.g. LuceneMNKnnVectorsFormat) for this version of * Lucene. */ public static KnnVectorsFormat getDefaultKnnVectorsFormat() { return new Lucene99HnswVectorsFormat(); } public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException { String[] files = dir.listAll(); if (files.length > 1 || (files.length == 1 && !files[0].equals("write.lock"))) { return true; } else { return false; } } public static void addIndexesSlowly(IndexWriter writer, DirectoryReader... 
readers) throws IOException { List leaves = new ArrayList<>(); for (DirectoryReader reader : readers) { for (LeafReaderContext context : reader.leaves()) { leaves.add(SlowCodecReaderWrapper.wrap(context.reader())); } } writer.addIndexes(leaves.toArray(new CodecReader[0])); } /** just tries to configure things to keep the open file count lowish */ public static void reduceOpenFiles(IndexWriter w) { // keep number of open files lowish MergePolicy mp = w.getConfig().getMergePolicy(); mp.setNoCFSRatio(1.0); if (mp instanceof LogMergePolicy) { LogMergePolicy lmp = (LogMergePolicy) mp; lmp.setMergeFactor(Math.min(5, lmp.getMergeFactor())); } else if (mp instanceof TieredMergePolicy) { TieredMergePolicy tmp = (TieredMergePolicy) mp; tmp.setMaxMergeAtOnce(Math.min(5, tmp.getMaxMergeAtOnce())); tmp.setSegmentsPerTier(Math.min(5, tmp.getSegmentsPerTier())); } MergeScheduler ms = w.getConfig().getMergeScheduler(); if (ms instanceof ConcurrentMergeScheduler) { // wtf... shouldn't it be even lower since it's 1 by default?!?! ((ConcurrentMergeScheduler) ms).setMaxMergesAndThreads(3, 2); } } /** * Checks some basic behaviour of an AttributeImpl * * @param reflectedValues contains a map with "AttributeClass#key" as values */ public static void assertAttributeReflection( final AttributeImpl att, Map reflectedValues) { final Map map = new HashMap<>(); att.reflectWith((attClass, key, value) -> map.put(attClass.getName() + '#' + key, value)); Assert.assertEquals("Reflection does not produce same map", reflectedValues, map); } /** Assert that the given {@link TopDocs} have the same top docs and consistent hit counts. 
*/
  public static void assertConsistent(TopDocs expected, TopDocs actual) {
    // Both must agree on whether there were any hits at all.
    Assert.assertEquals(
        "wrong total hits", expected.totalHits.value() == 0, actual.totalHits.value() == 0);
    if (expected.totalHits.relation() == TotalHits.Relation.EQUAL_TO) {
      if (actual.totalHits.relation() == TotalHits.Relation.EQUAL_TO) {
        // Both exact: counts must match exactly.
        Assert.assertEquals(
            "wrong total hits", expected.totalHits.value(), actual.totalHits.value());
      } else {
        // actual is a lower bound, so it may not exceed the exact expected count.
        Assert.assertTrue(
            "wrong total hits", expected.totalHits.value() >= actual.totalHits.value());
      }
    } else if (actual.totalHits.relation() == TotalHits.Relation.EQUAL_TO) {
      // expected is a lower bound, so it may not exceed the exact actual count.
      Assert.assertTrue("wrong total hits", expected.totalHits.value() <= actual.totalHits.value());
    }
    Assert.assertEquals("wrong hit count", expected.scoreDocs.length, actual.scoreDocs.length);
    for (int hitIDX = 0; hitIDX < expected.scoreDocs.length; hitIDX++) {
      final ScoreDoc expectedSD = expected.scoreDocs[hitIDX];
      final ScoreDoc actualSD = actual.scoreDocs[hitIDX];
      Assert.assertEquals("wrong hit docID", expectedSD.doc, actualSD.doc);
      Assert.assertEquals("wrong hit score", expectedSD.score, actualSD.score, 0.0);
      if (expectedSD instanceof FieldDoc) {
        Assert.assertTrue(actualSD instanceof FieldDoc);
        Assert.assertArrayEquals(
            "wrong sort field values",
            ((FieldDoc) expectedSD).fields,
            ((FieldDoc) actualSD).fields);
      } else {
        Assert.assertFalse(actualSD instanceof FieldDoc);
      }
    }
  }

  // NOTE: this is likely buggy, and cannot clone fields
  // with tokenStreamValues, etc. Use at your own risk!!
  // TODO: is there a pre-existing way to do this!!!
  public static Document cloneDocument(Document doc1) {
    final Document doc2 = new Document();
    for (IndexableField f : doc1.getFields()) {
      final Field field1 = (Field) f;
      final Field field2;
      final DocValuesType dvType = field1.fieldType().docValuesType();
      final int dimCount = field1.fieldType().pointDimensionCount();
      if (f instanceof KeywordField) {
        field2 =
            new KeywordField(
                f.name(),
                f.stringValue(),
                f.fieldType().stored() ?
Field.Store.YES : Field.Store.NO);
      } else if (f instanceof IntField) {
        field2 =
            new IntField(
                f.name(),
                f.numericValue().intValue(),
                f.fieldType().stored() ? Field.Store.YES : Field.Store.NO);
      } else if (dvType != DocValuesType.NONE) {
        // Doc-values field: recreate the matching doc-values field type.
        switch (dvType) {
          case NUMERIC:
            field2 = new NumericDocValuesField(field1.name(), field1.numericValue().longValue());
            break;
          case BINARY:
            field2 = new BinaryDocValuesField(field1.name(), field1.binaryValue());
            break;
          case SORTED:
            field2 = new SortedDocValuesField(field1.name(), field1.binaryValue());
            break;
          case NONE:
          case SORTED_SET:
          case SORTED_NUMERIC:
          default:
            throw new IllegalStateException("unknown Type: " + dvType);
        }
      } else if (dimCount != 0) {
        // Point field: copy the raw point bytes out of the shared BytesRef.
        BytesRef br = field1.binaryValue();
        byte[] bytes = new byte[br.length];
        System.arraycopy(br.bytes, br.offset, bytes, 0, br.length);
        field2 = new BinaryPoint(field1.name(), bytes, field1.fieldType());
      } else {
        field2 = new Field(field1.name(), field1.stringValue(), field1.fieldType());
      }
      doc2.add(field2);
    }
    return doc2;
  }

  // Returns a DocsEnum, but randomly sometimes uses a
  // DocsAndFreqsEnum, DocsAndPositionsEnum. Returns null
  // if field/term doesn't exist:
  public static PostingsEnum docs(
      Random random, IndexReader r, String field, BytesRef term, PostingsEnum reuse, int flags)
      throws IOException {
    final Terms terms = MultiTerms.getTerms(r, field);
    if (terms == null) {
      return null;
    }
    final TermsEnum termsEnum = terms.iterator();
    if (!termsEnum.seekExact(term)) {
      return null;
    }
    return docs(random, termsEnum, reuse, flags);
  }

  // Returns a PostingsEnum with random features available
  public static PostingsEnum docs(Random random, TermsEnum termsEnum, PostingsEnum reuse, int flags)
      throws IOException {
    // TODO: simplify this method? it would be easier to randomly either use the flags passed, or do
    // the random selection,
    // FREQS should be part of the random selection instead of outside on its own?
if (random.nextBoolean()) {
      if (random.nextBoolean()) {
        // Randomly pick a positional flag set instead of honoring the requested flags.
        final int posFlags;
        switch (random.nextInt(4)) {
          case 0:
            posFlags = PostingsEnum.POSITIONS;
            break;
          case 1:
            posFlags = PostingsEnum.OFFSETS;
            break;
          case 2:
            posFlags = PostingsEnum.PAYLOADS;
            break;
          default:
            posFlags = PostingsEnum.ALL;
            break;
        }
        return termsEnum.postings(null, posFlags);
      }
      flags |= PostingsEnum.FREQS;
    }
    return termsEnum.postings(reuse, flags);
  }

  public static CharSequence stringToCharSequence(String string, Random random) {
    return bytesToCharSequence(new BytesRef(string), random);
  }

  /** Randomly presents the UTF-8 bytes as a CharsRef, a CharBuffer, or a plain String. */
  public static CharSequence bytesToCharSequence(BytesRef ref, Random random) {
    switch (random.nextInt(5)) {
      case 4:
        final char[] chars = new char[ref.length];
        final int len = UnicodeUtil.UTF8toUTF16(ref.bytes, ref.offset, ref.length, chars);
        return new CharsRef(chars, 0, len);
      case 3:
        return CharBuffer.wrap(ref.utf8ToString());
      default:
        return ref.utf8ToString();
    }
  }

  /** Shutdown {@link ExecutorService} and wait (up to one second) for its termination. */
  public static void shutdownExecutorService(ExecutorService ex) {
    if (ex != null) {
      try {
        ex.shutdown();
        ex.awaitTermination(1, TimeUnit.SECONDS);
      } catch (InterruptedException e) {
        // Just report it on the syserr.
        System.err.println("Could not properly close executor service.");
        e.printStackTrace(System.err);
      }
    }
  }

  /**
   * Returns a valid (compiling) Pattern instance with random stuff inside. Be careful when applying
   * random patterns to longer strings as certain types of patterns may explode into exponential
   * times in backtracking implementations (such as Java's).
*/
  public static Pattern randomPattern(Random random) {
    final String nonBmpString = "AB\uD840\uDC00C";
    while (true) {
      try {
        Pattern p = Pattern.compile(TestUtil.randomRegexpishString(random));
        String replacement = null;
        // ignore bugs in Sun's regex impl
        try {
          replacement = p.matcher(nonBmpString).replaceAll("_");
        } catch (
            @SuppressWarnings("unused")
            StringIndexOutOfBoundsException jdkBug) {
          System.out.println("WARNING: your jdk is buggy!");
          System.out.println(
              "Pattern.compile(\""
                  + p.pattern()
                  + "\").matcher(\"AB\\uD840\\uDC00C\").replaceAll(\"_\"); should not throw IndexOutOfBounds!");
        }
        // Make sure the result of applying the pattern to a string with extended
        // unicode characters is a valid utf16 string. See LUCENE-4078 for discussion.
        if (replacement != null && UnicodeUtil.validUTF16String(replacement)) {
          return p;
        }
      } catch (
          @SuppressWarnings("unused")
          PatternSyntaxException ignored) {
        // Loop trying until we hit something that compiles.
      }
    }
  }

  /**
   * Returns a random analysis-friendly string of at most {@code maxLength} chars: usually
   * space-separated 'words', occasionally a purely random string.
   */
  public static String randomAnalysisString(Random random, int maxLength, boolean simple) {
    assert maxLength >= 0;
    if (maxLength == 0) {
      // Fix: the assert above permits 0, but Random.nextInt(0) below would throw
      // IllegalArgumentException (the bound must be positive).
      return "";
    }

    // sometimes just a purely random string
    if (random.nextInt(31) == 0) {
      return randomSubString(random, random.nextInt(maxLength), simple);
    }

    // otherwise, try to make it more realistic with 'words' since most tests use MockTokenizer
    // first decide how big the string will really be: 0..n
    maxLength = random.nextInt(maxLength);
    int avgWordLength = TestUtil.nextInt(random, 3, 8);
    StringBuilder sb = new StringBuilder();
    while (sb.length() < maxLength) {
      if (sb.length() > 0) {
        sb.append(' ');
      }
      // word lengths are roughly Gaussian around avgWordLength; resample until non-negative
      int wordLength = -1;
      while (wordLength < 0) {
        wordLength = (int) (random.nextGaussian() * 3 + avgWordLength);
      }
      wordLength = Math.min(wordLength, maxLength - sb.length());
      sb.append(randomSubString(random, wordLength, simple));
    }
    return sb.toString();
  }

  public static String randomSubString(Random random, int wordLength, boolean simple) {
    if (wordLength == 0) {
      return "";
    }
    int evilness = TestUtil.nextInt(random, 0, 20);
StringBuilder sb = new StringBuilder();
    while (sb.length() < wordLength) {
      if (simple) {
        sb.append(
            random.nextBoolean()
                ? TestUtil.randomSimpleString(random, wordLength)
                : TestUtil.randomHtmlishString(random, wordLength));
      } else {
        // evilness buckets: <10 simple, <15 realistic unicode, 16 htmlish, 17 regexpish,
        // everything else (15, 18..20) full random unicode.
        if (evilness < 10) {
          sb.append(TestUtil.randomSimpleString(random, wordLength));
        } else if (evilness < 15) {
          assert sb.isEmpty(); // we should always get wordLength back!
          sb.append(TestUtil.randomRealisticUnicodeString(random, wordLength, wordLength));
        } else if (evilness == 16) {
          sb.append(TestUtil.randomHtmlishString(random, wordLength));
        } else if (evilness == 17) {
          // gives a lot of punctuation
          sb.append(TestUtil.randomRegexpishString(random, wordLength));
        } else {
          sb.append(TestUtil.randomUnicodeString(random, wordLength));
        }
      }
    }
    if (sb.length() > wordLength) {
      sb.setLength(wordLength);
      // don't leave a dangling high surrogate at the truncation point
      if (Character.isHighSurrogate(sb.charAt(wordLength - 1))) {
        sb.setLength(wordLength - 1);
      }
    }
    if (random.nextInt(17) == 0) {
      // mix up case
      String mixedUp = TestUtil.randomlyRecaseCodePoints(random, sb.toString());
      assert mixedUp.length() == sb.length();
      return mixedUp;
    } else {
      return sb.toString();
    }
  }

  /**
   * For debugging: tries to include br.utf8ToString(), but if that fails (because it's not valid
   * utf8, which is fine!), just use ordinary toString.
   */
  public static String bytesRefToString(BytesRef br) {
    if (br == null) {
      return "(null)";
    } else {
      try {
        return br.utf8ToString() + " " + br;
      } catch (@SuppressWarnings("unused") AssertionError | IllegalArgumentException t) {
        // If BytesRef isn't actually UTF8, or it's e.g. a
        // prefix of UTF8 that ends mid-unicode-char, we
        // fall back to hex:
        return br.toString();
      }
    }
  }

  /** Returns a copy of the source directory, with file contents stored in RAM.
*/
  public static Directory ramCopyOf(Directory dir) throws IOException {
    Directory ram = new ByteBuffersDirectory();
    for (String file : dir.listAll()) {
      // copy only real index files (segments files and codec-named files)
      boolean isIndexFile =
          file.startsWith(IndexFileNames.SEGMENTS)
              || IndexFileNames.CODEC_FILE_PATTERN.matcher(file).matches();
      if (isIndexFile) {
        ram.copyFrom(dir, file, file, IOContext.DEFAULT);
      }
    }
    return ram;
  }

  /** True if the directory is ultimately backed by a {@code WindowsFS} filter filesystem. */
  public static boolean hasWindowsFS(Directory dir) {
    Directory unwrapped = FilterDirectory.unwrap(dir);
    if (!(unwrapped instanceof FSDirectory)) {
      return false;
    }
    return hasWindowsFS(((FSDirectory) unwrapped).getDirectory());
  }

  /** True if the path's filesystem chain contains a {@code WindowsFS}. */
  public static boolean hasWindowsFS(Path path) {
    FileSystem current = path.getFileSystem();
    while (current instanceof FilterFileSystem) {
      FilterFileSystem filtered = (FilterFileSystem) current;
      if (filtered.provider() instanceof WindowsFS) {
        return true;
      }
      current = filtered.getDelegate();
    }
    return false;
  }

  /** True if the directory is ultimately backed by a {@code VirusCheckingFS}. */
  public static boolean hasVirusChecker(Directory dir) {
    Directory unwrapped = FilterDirectory.unwrap(dir);
    return unwrapped instanceof FSDirectory
        && hasVirusChecker(((FSDirectory) unwrapped).getDirectory());
  }

  /** True if the path's filesystem chain contains a {@code VirusCheckingFS}. */
  public static boolean hasVirusChecker(Path path) {
    FileSystem current = path.getFileSystem();
    while (current instanceof FilterFileSystem) {
      FilterFileSystem filtered = (FilterFileSystem) current;
      if (filtered.provider() instanceof VirusCheckingFS) {
        return true;
      }
      current = filtered.getDelegate();
    }
    return false;
  }

  /** Returns true if VirusCheckingFS is in use and was in fact already enabled */
  public static boolean disableVirusChecker(Directory in) {
    Directory dir = FilterDirectory.unwrap(in);
    if (dir instanceof FSDirectory) {
      FileSystem fs = ((FSDirectory) dir).getDirectory().getFileSystem();
      while (fs instanceof FilterFileSystem) {
        FilterFileSystem ffs = (FilterFileSystem) fs;
        if (ffs.provider() instanceof VirusCheckingFS) {
          VirusCheckingFS vfs = (VirusCheckingFS) ffs.provider();
          boolean isEnabled =
vfs.isEnabled();
          vfs.disable();
          return isEnabled;
        }
        fs = ffs.getDelegate();
      }
    }
    return false;
  }

  /** Walks the FilterFileSystem chain and re-enables the first {@code VirusCheckingFS} found. */
  public static void enableVirusChecker(Directory in) {
    Directory dir = FilterDirectory.unwrap(in);
    if (dir instanceof FSDirectory) {
      FileSystem fs = ((FSDirectory) dir).getDirectory().getFileSystem();
      while (fs instanceof FilterFileSystem) {
        FilterFileSystem ffs = (FilterFileSystem) fs;
        if (ffs.provider() instanceof VirusCheckingFS) {
          VirusCheckingFS vfs = (VirusCheckingFS) ffs.provider();
          vfs.enable();
          return;
        }
        fs = ffs.getDelegate();
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy