All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.lucene.tests.util.LineFileDocs Maven / Gradle / Ivy

There is a newer version: 7.6.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.tests.util;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.KeywordField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils;

/**
 * Minimal port of benchmark's LineDocSource + DocMaker, so tests can enumerate docs from a line
 * file created by benchmark's WriteLineDoc task.
 *
 * <p>Every line is expected to contain a title, a date and a body, in that order, separated by tab
 * characters. When the end of the file is reached it is re-opened and reading continues.
 */
public class LineFileDocs implements Closeable {
  /**
   * Converts date formats for europarl ("2023-02-23") and enwiki ("12-JAN-2010 12:32:45.000") into
   * {@link LocalDateTime}.
   */
  public static final Function<String, LocalDateTime> DATE_FIELD_VALUE_TO_LOCALDATETIME =
      new Function<>() {
        // Compiled once; decides which of the two formatters below applies to a value.
        private final Pattern euroParlShape = Pattern.compile("^[0-9]{4}-[0-9]{2}-[0-9]{2}$");

        final DateTimeFormatter euroParl =
            new DateTimeFormatterBuilder()
                .parseStrict()
                .parseCaseInsensitive()
                .appendPattern("uuuu-MM-dd")
                .toFormatter(Locale.ROOT);

        final DateTimeFormatter enwiki =
            new DateTimeFormatterBuilder()
                .parseStrict()
                .parseCaseInsensitive()
                .appendPattern("dd-MMM-uuuu HH:mm:ss['.'SSS]")
                .toFormatter(Locale.ROOT);

        @Override
        public LocalDateTime apply(String s) {
          if (euroParlShape.matcher(s).matches()) {
            // date-only value: interpreted as midnight of that day
            return euroParl.parse(s, LocalDate::from).atStartOfDay();
          } else {
            return enwiki.parse(s, LocalDateTime::from);
          }
        }
      };

  private static final int BUFFER_SIZE = 1 << 16; // 64K

  /** Column separator of the line file. */
  private static final char SEP = '\t';

  /** Current reader over the line file; {@code null} once {@link #close()} has been called. */
  private BufferedReader reader;

  /** Source of the sequential doc ids assigned by {@link #nextDoc()}. */
  private final AtomicInteger id = new AtomicInteger();

  private final String path;
  private final Random random;

  /** Per-thread reusable document and fields. */
  private final CloseableThreadLocal<DocState> threadDocs = new CloseableThreadLocal<>();

  /**
   * Opens the line file at {@code path}. The path is first looked up as a classpath resource and,
   * if no such resource exists, as a filesystem path.
   *
   * @param random used only to seed a private {@link Random} owned by this instance
   * @param path classpath resource name or filesystem path of the line file
   * @throws IOException if the line file (or its companion seek file) cannot be opened or read
   */
  public LineFileDocs(Random random, String path) throws IOException {
    this.path = path;
    this.random = new Random(random.nextLong());
    open();
  }

  /**
   * Opens the line file named by {@code LuceneTestCase.TEST_LINE_DOCS_FILE}.
   *
   * @param random used only to seed a private {@link Random} owned by this instance
   * @throws IOException if the line file cannot be opened or read
   */
  public LineFileDocs(Random random) throws IOException {
    this(random, LuceneTestCase.TEST_LINE_DOCS_FILE);
  }

  /** Closes the underlying reader and the per-thread document state. */
  @Override
  public synchronized void close() throws IOException {
    IOUtils.close(reader, threadDocs);
    reader = null;
  }

  /**
   * Picks a random start offset within the first third of the file.
   *
   * @return 0 if {@code random} is null or the file has at most 3 bytes, otherwise a value in
   *     {@code [0, size / 3)}
   */
  private long randomSeekPos(Random random, long size) {
    if (random == null || size <= 3L) {
      return 0L;
    } else {
      // mask off the sign bit so the modulo result is never negative
      return (random.nextLong() & Long.MAX_VALUE) % (size / 3);
    }
  }

  /**
   * Opens {@link #path} at a random position and installs a strict UTF-8 {@link #reader} on it.
   *
   * <p>Uncompressed filesystem files are positioned directly through a channel. Everything else
   * (classpath resources and {@code .gz} files) is treated as gzip data and positioned at one of
   * the pre-computed skip points listed in the companion {@code .seek} file.
   *
   * @throws IOException if a file cannot be opened or read; anything opened so far is closed
   */
  private synchronized void open() throws IOException {
    InputStream is = getClass().getResourceAsStream(path);

    // Most recently opened resource, released if anything below fails part-way:
    Closeable opened = is;
    boolean success = false;
    try {
      // true if the InputStream is not already randomly seek'd after the if/else block below:
      final boolean needSkip;

      if (is == null) {
        // if it's not in classpath, we load it as absolute filesystem path (e.g. Jenkins' home dir)
        Path file = Paths.get(path);
        if (path.endsWith(".gz")) {
          // if it is a gzip file, we need to use InputStream and seek to one of the pre-computed
          // skip points:
          is = Files.newInputStream(file);
          opened = is;
          needSkip = true;
        } else {
          // file is not compressed: optimized seek using SeekableByteChannel
          final long seekTo = randomSeekPos(random, Files.size(file));
          final SeekableByteChannel channel = Files.newByteChannel(file);
          opened = channel;
          if (LuceneTestCase.VERBOSE) {
            System.out.println("TEST: LineFileDocs: file seek to fp=" + seekTo + " on open");
          }
          channel.position(seekTo);
          is = Channels.newInputStream(channel);
          opened = is;

          // read until newline char, otherwise we may hit
          // "java.nio.charset.MalformedInputException: Input length = 1" in readLine() later,
          // because we seeked part way through a multi-byte (in UTF-8) encoded unicode character:
          if (seekTo > 0L) {
            int b;
            do {
              b = is.read();
            } while (b >= 0 && b != 13 && b != 10);
          }

          needSkip = false;
        }
      } else {
        // the file comes from Classpath:
        needSkip = true;
      }

      if (needSkip) {
        // LUCENE-9191: use the optimized (pre-computed, using
        // dev-tools/scripts/create_line_file_docs.py) seek file, so we can seek in a gzip'd file
        final long seekTo = randomSkipPoint();

        // dev-tools/scripts/create_line_file_docs.py ensures this is a "safe" skip point, and we
        // can begin gunziping from here:
        skipFully(is, seekTo);
        is = new GZIPInputStream(is);
        opened = is;

        if (LuceneTestCase.VERBOSE) {
          System.out.println("TEST: LineFileDocs: stream skip to fp=" + seekTo + " on open");
        }
      }

      // REPORT (rather than replace) bad input so corrupt data fails loudly:
      CharsetDecoder decoder =
          StandardCharsets.UTF_8
              .newDecoder()
              .onMalformedInput(CodingErrorAction.REPORT)
              .onUnmappableCharacter(CodingErrorAction.REPORT);
      reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
      success = true;
    } finally {
      if (success == false) {
        IOUtils.closeWhileHandlingException(opened);
      }
    }
  }

  /**
   * Reads the seek file that accompanies {@link #path} and returns one of its skip points, chosen
   * at random. Offset 0 is always a candidate.
   *
   * @throws IllegalArgumentException if {@link #path} has no extension
   * @throws IOException if the seek file cannot be opened or read
   */
  private long randomSkipPoint() throws IOException {
    int index = path.lastIndexOf('.');
    if (index == -1) {
      throw new IllegalArgumentException(
          "could not determine extension for path \"" + path + "\"");
    }

    // e.g. foo.txt --> foo.seek, foo.txt.gz --> foo.txt.seek
    String seekFilePath = path.substring(0, index) + ".seek";
    InputStream seekIS = getClass().getResourceAsStream(seekFilePath);
    if (seekIS == null) {
      seekIS = Files.newInputStream(Paths.get(seekFilePath));
    }

    List<Long> skipPoints = new ArrayList<>();
    // explicitly insert implicit 0 as the first skip point:
    skipPoints.add(0L);

    try (BufferedReader seekReader =
        new BufferedReader(new InputStreamReader(seekIS, StandardCharsets.UTF_8))) {
      String seekLine;
      while ((seekLine = seekReader.readLine()) != null) {
        skipPoints.add(Long.parseLong(seekLine.trim()));
      }
    }

    return skipPoints.get(random.nextInt(skipPoints.size()));
  }

  /**
   * Skips exactly {@code count} bytes of {@code in}. A single {@link InputStream#skip(long)} call
   * is allowed to skip fewer bytes than requested, so this keeps going until done.
   *
   * @throws IOException if the stream ends before {@code count} bytes were skipped
   */
  private static void skipFully(InputStream in, long count) throws IOException {
    long remaining = count;
    while (remaining > 0L) {
      long skipped = in.skip(remaining);
      if (skipped > 0L) {
        remaining -= skipped;
      } else if (in.read() >= 0) {
        // skip() made no progress although data is left: consume a single byte instead
        remaining--;
      } else {
        throw new IOException(
            "reached end of stream with " + remaining + " of " + count + " bytes left to skip");
      }
    }
  }

  /** Closes the current reader and opens the file again; callers must hold this monitor. */
  private void reopen() throws IOException {
    reader.close();
    reader = null;
    open();
  }

  /** Re-opens the file and restarts doc ids at 0. */
  public synchronized void reset() throws IOException {
    reopen();
    id.set(0);
  }

  /** Reusable document plus direct references to the fields whose values change per line. */
  private static final class DocState {
    final Document doc;
    final Field titleTokenized;
    final Field title;
    final Field body;
    final Field id;
    final Field idNum;
    final Field date;
    final Field pageViews;

    public DocState() {
      doc = new Document();

      title = new KeywordField("title", "", Field.Store.NO);
      doc.add(title);

      // stored text with positions and full term vectors, shared by titleTokenized and body
      FieldType ft = new FieldType(TextField.TYPE_STORED);
      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(true);
      ft.setStoreTermVectorPositions(true);

      titleTokenized = new Field("titleTokenized", "", ft);
      doc.add(titleTokenized);

      body = new Field("body", "", ft);
      doc.add(body);

      id = new StringField("docid", "", Field.Store.YES);
      doc.add(id);

      idNum = new IntField("docid_int", 0, Field.Store.NO);
      doc.add(idNum);

      date = new StringField("date", "", Field.Store.YES);
      doc.add(date);

      // A numeric DV field that can be used for DV updates
      pageViews = new NumericDocValuesField("page_views", 0L);
      doc.add(pageViews);
    }
  }

  /**
   * Reads the next line, rewinding at end of file, and loads it into this thread's document.
   *
   * <p>Note: Document instance is re-used per-thread
   *
   * @return the calling thread's document, populated from the line that was read
   * @throws IOException if reading fails, or if no line can be read even after rewinding
   * @throws RuntimeException if the line does not contain at least two tab separators
   */
  public Document nextDoc() throws IOException {
    final String line;
    synchronized (this) {
      String next = reader.readLine();
      if (next == null) {
        // Always rewind at end:
        if (LuceneTestCase.VERBOSE) {
          System.out.println("TEST: LineFileDocs: now rewind file...");
        }
        reopen();
        next = reader.readLine();
        if (next == null) {
          // fail with a clear message instead of a NullPointerException further down
          throw new IOException("no line could be read from \"" + path + "\" after rewinding");
        }
      }
      line = next;
    }

    DocState docState = threadDocs.get();
    if (docState == null) {
      docState = new DocState();
      threadDocs.set(docState);
    }

    // line layout: title TAB date TAB body
    int spot = line.indexOf(SEP);
    if (spot == -1) {
      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
    }
    int spot2 = line.indexOf(SEP, 1 + spot);
    if (spot2 == -1) {
      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
    }

    docState.body.setStringValue(line.substring(1 + spot2));
    final String title = line.substring(0, spot);
    docState.title.setStringValue(title);
    docState.titleTokenized.setStringValue(title);
    docState.date.setStringValue(line.substring(1 + spot, spot2));
    final int i = id.getAndIncrement();
    docState.id.setStringValue(Integer.toString(i));
    docState.idNum.setIntValue(i);
    docState.pageViews.setLongValue(random.nextInt(10_000));

    // NOTE(review): the document built below is never returned -- docState.doc is returned in
    // every case -- so these sparse fields currently have no visible effect. The block is kept
    // unchanged because it consumes values from `random`; confirm the intent before altering it.
    if (random.nextInt(5) == 4) {
      // Make some sparse fields
      Document doc = new Document();
      for (IndexableField field : docState.doc) {
        doc.add(field);
      }

      if (random.nextInt(3) == 1) {
        int x = random.nextInt(4);
        doc.add(new IntPoint("docLength" + x, line.length()));
      }

      if (random.nextInt(3) == 1) {
        int x = random.nextInt(4);
        doc.add(new IntPoint("docTitleLength" + x, title.length()));
      }

      if (random.nextInt(3) == 1) {
        int x = random.nextInt(4);
        doc.add(new NumericDocValuesField("docLength" + x, line.length()));
      }

      // TODO: more random sparse fields here too
    }

    return docState.doc;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy